diff --git a/.bazelrc b/.bazelrc
index fc2995dc838c5..3656a86eb364c 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -2,7 +2,11 @@ build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
+<<<<<<< HEAD
 # (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
+=======
+# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # system include path.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin
diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh
index b25f3b21e8eb1..9a178300266b7 100644
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,6 +3,7 @@ set -eux -o pipefail
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+<<<<<<< HEAD
 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
 export TORCH_CUDA_ARCH_LIST="8.0;9.0"
@@ -19,6 +20,10 @@ if [[ "$DESIRED_CUDA" == *"13"* ]]; then
 export TORCH_NVCC_FLAGS="-compress-mode=size"
 # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
 export BUILD_BUNDLE_PTXAS=1
+=======
+if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+ export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -32,6 +37,7 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
+<<<<<<< HEAD
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
 echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
@@ -50,4 +56,16 @@ else
 fi
 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+=======
+pip install auditwheel==6.2.0
+if [ "$DESIRED_CUDA" = "cpu" ]; then
+ echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
+ #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files + USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn +else + echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" + export USE_SYSTEM_NCCL=1 + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files + USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a99e5f8f65659..fe0a76c275c5c 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -13,6 +13,52 @@ def list_dir(path: str) -> list[str]: return check_output(["ls", "-1", path]).decode().split("\n") +<<<<<<< HEAD +======= +def build_ArmComputeLibrary() -> None: + """ + Using ArmComputeLibrary for aarch64 PyTorch + """ + print("Building Arm Compute Library") + acl_build_flags = [ + "debug=0", + "neon=1", + "opencl=0", + "os=linux", + "openmp=1", + "cppthreads=0", + "arch=armv8a", + "multi_isa=1", + "fixed_format_kernels=1", + "build=native", + ] + acl_install_dir = "/acl" + acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") + if os.path.isdir(acl_install_dir): + shutil.rmtree(acl_install_dir) + if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): + check_call( + [ + "git", + "clone", + "https://github.com/ARM-software/ComputeLibrary.git", + "-b", + "v25.02", + "--depth", + "1", + "--shallow-submodules", + ] + ) + + check_call( + ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, + cwd=acl_checkout_dir, + ) + for d in ["arm_compute", "include", "utils", "support", "src", "build"]: + shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def replace_tag(filename) -> None: with open(filename) as f: lines = f.readlines() @@ -26,6 +72,7 @@ def replace_tag(filename) -> None: f.writelines(lines) +<<<<<<< HEAD def patch_library_rpath( folder: str, lib_name: str, @@ -88,11 +135,14 @@ def copy_and_patch_library( patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) +<<<<<<< HEAD os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") # Delete original wheel since it will be repackaged @@ -206,6 +256,57 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: # Copy libraries to unzipped_folder/torch/lib for lib_path in libs_to_copy: copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) +======= + wheelname = os.path.basename(wheel_path) + os.mkdir(f"{folder}/tmp") + os.system(f"unzip {wheel_path} -d {folder}/tmp") + libs_to_copy = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + 
"/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusparse.so.12", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", + "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] + + if "129" in desired_cuda: + libs_to_copy += [ + "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + "/usr/local/cuda/lib64/libcufile.so.0", + "/usr/local/cuda/lib64/libcufile_rdma.so.1", + ] + + # Copy libraries to unzipped_folder/a/lib + for lib_path in libs_to_copy: + lib_name = os.path.basename(lib_path) + shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -213,8 +314,19 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break +<<<<<<< HEAD os.system(f"wheel pack {folder}/tmp/ -d {folder}") os.system(f"rm -rf {folder}/tmp/") +======= + os.mkdir(f"{folder}/cuda_wheel") + os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") + shutil.move( + f"{folder}/cuda_wheel/{wheelname}", + f"{folder}/{wheelname}", + copy_function=shutil.copy2, + ) + os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def complete_wheel(folder: str) -> str: @@ -237,7 +349,18 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: +<<<<<<< HEAD repaired_wheel_name = list_dir(f"/{folder}/dist")[0] +======= + repaired_wheel_name = wheel_name.replace( + "linux_aarch64", "manylinux_2_28_aarch64" + ) + print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") + os.rename( + f"/{folder}/dist/{wheel_name}", + f"/{folder}/dist/{repaired_wheel_name}", + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -274,6 +397,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") +<<<<<<< HEAD build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: @@ -288,6 +412,12 @@ def parse_arguments(): else: print("Configuring build for bundled NVIDIA libraries") # Keep existing static linking approach - already configured above +======= + build_vars = 
"CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) + if enable_cuda: + build_vars = "MAX_JOBS=5 " + build_vars +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") @@ -313,6 +443,7 @@ def parse_arguments(): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " if enable_mkldnn: +<<<<<<< HEAD print("build pytorch with mkldnn+acl backend") build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " build_vars += "ACL_ROOT_DIR=/acl " @@ -324,6 +455,25 @@ def parse_arguments(): print("build pytorch without mkldnn backend") os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation") +======= + build_ArmComputeLibrary() + print("build pytorch with mkldnn+acl backend") + build_vars += ( + "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " + "ACL_ROOT_DIR=/acl " + "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " + "ACL_INCLUDE_DIR=/acl/build " + "ACL_LIBRARY=/acl/build " + ) + if enable_cuda: + build_vars += "BLAS=NVPL " + else: + build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS " + else: + print("build pytorch without mkldnn backend") + + os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if enable_cuda: print("Updating Cuda Dependency") filename = os.listdir("/pytorch/dist/") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index a157ec57b574a..e1809a8528aae 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,11 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return +<<<<<<< HEAD except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 +======= + except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -299,6 +303,43 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: ) +<<<<<<< HEAD +======= +def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: + print("Building OpenBLAS") + host.run_cmd( + f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}" + ) + make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" + host.run_cmd( + f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS" + ) + + +def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: + print("Building Arm Compute Library") + acl_build_flags = " ".join( + [ + "debug=0", + "neon=1", + "opencl=0", + "os=linux", + "openmp=1", + "cppthreads=0", + "arch=armv8a", + "multi_isa=1", + "fixed_format_kernels=1", + "build=native", + ] + ) + host.run_cmd( + f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}" + ) + + host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: host.run_cmd("pip3 install auditwheel") host.run_cmd( @@ -404,11 +445,21 @@ def build_torchvision( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vision_wheel_name = host.list_dir("vision/dist")[0] embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) @@ -459,11 +510,21 @@ def build_torchdata( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel_name = host.list_dir("data/dist")[0] embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) @@ -515,11 +576,21 @@ def build_torchtext( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) + if host.using_docker(): + build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" + + host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel_name = host.list_dir("text/dist")[0] embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) @@ -573,14 +644,24 @@ def build_torchaudio( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: +<<<<<<< HEAD build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" +======= + build_vars += ( + f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if host.using_docker(): 
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd( f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ && ./packaging/ffmpeg/build.sh \ +<<<<<<< HEAD && {build_vars} python3 -m build --wheel --no-isolation" +======= + && {build_vars} python3 setup.py bdist_wheel" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) wheel_name = host.list_dir("audio/dist")[0] @@ -666,6 +747,10 @@ def start_build( configure_system( host, compiler=compiler, use_conda=use_conda, python_version=python_version ) +<<<<<<< HEAD +======= + build_OpenBLAS(host, git_clone_flags) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if host.using_docker(): print("Move libgfortant.a into a standard location") @@ -688,12 +773,19 @@ def start_build( f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" ) +<<<<<<< HEAD host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh") print("Building PyTorch wheel") build_opts = "" if pytorch_build_number is not None: build_opts += f" -C--build-option=--build-number={pytorch_build_number}" +======= + print("Building PyTorch wheel") + build_opts = "" + if pytorch_build_number is not None: + build_opts += f" --build-number {pytorch_build_number}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == "nightly": @@ -710,6 +802,7 @@ def start_build( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" if enable_mkldnn: +<<<<<<< HEAD host.run_cmd("pytorch/.ci/docker/common/install_acl.sh") print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" @@ -722,6 +815,17 @@ def start_build( print("Repair the wheel") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] ld_library_path = "/acl/build:$HOME/pytorch/build/lib" +======= + build_ArmComputeLibrary(host, git_clone_flags) + print("build pytorch with mkldnn+acl backend") + build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" + host.run_cmd( + f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}" + ) + print("Repair the wheel") + pytorch_wheel_name = host.list_dir("pytorch/dist")[0] + ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) host.run_cmd( f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" ) @@ -733,7 +837,11 @@ def start_build( else: print("build pytorch without mkldnn backend") host.run_cmd( +<<<<<<< HEAD f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" +======= + f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) print("Deleting build folder") @@ -877,7 +985,11 @@ def terminate_instances(instance_type: str) -> None: def parse_arguments(): from argparse import ArgumentParser +<<<<<<< HEAD parser = ArgumentParser("Build and test AARCH64 wheels using EC2") +======= + parser = ArgumentParser("Builid and test AARCH64 
wheels using EC2") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) parser.add_argument("--key-name", type=str) parser.add_argument("--debug", action="store_true") parser.add_argument("--build-only", action="store_true") @@ -974,7 +1086,11 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) +<<<<<<< HEAD python_version = args.python_version if args.python_version is not None else "3.10" +======= + python_version = args.python_version if args.python_version is not None else "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 5a97a0a3c2d46..a795edf2c0b9b 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -36,6 +36,7 @@ See `build.sh` for valid build environments (it's the giant switch). # Set flags (see build.sh) and build image sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest ``` +<<<<<<< HEAD ## [Guidance] Adding a New Base Docker Image @@ -137,3 +138,5 @@ If your new Docker image needs a library installed from a specific pinned commit The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the pinned commit updates. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index ce7803cf9acd2..87fbc51917829 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -64,6 +64,7 @@ FROM cuda as cuda12.9 RUN bash ./install_cuda.sh 12.9 ENV DESIRED_CUDA=12.9 +<<<<<<< HEAD FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 @@ -71,6 +72,10 @@ ENV DESIRED_CUDA=13.0 FROM ${ROCM_IMAGE} as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +======= +FROM ${ROCM_IMAGE} as rocm +ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh ENV MKLROOT /opt/intel @@ -81,10 +86,17 @@ ADD ./common/install_mnist.sh install_mnist.sh RUN bash ./install_mnist.sh FROM base as all_cuda +<<<<<<< HEAD COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 +======= +COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 +COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 +COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ad234ce1ffb93..50aeb082d09b5 100755 --- a/.ci/docker/almalinux/build.sh +++ 
b/.ci/docker/almalinux/build.sh @@ -36,12 +36,15 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm +<<<<<<< HEAD PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" # add gfx950, gfx115x conditionally starting in ROCm 7.0 if [[ "$ROCM_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 54339e5efbbde..054853b44efcd 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -56,14 +56,20 @@ elif [[ "$image" == *-noble* ]]; then UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION +<<<<<<< HEAD elif [[ "$image" == *centos* ]]; then extract_version_from_image_name centos CENTOS_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [ -n "${UBUNTU_VERSION}" ]; then OS="ubuntu" +<<<<<<< HEAD elif [ -n "${CENTOS_VERSION}" ]; then OS="centos" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unable to derive operating system base..." exit 1 @@ -80,6 +86,7 @@ elif [[ "$image" == *cuda*linter* ]]; then elif [[ "$image" == *linter* ]]; then # Use a separate Dockerfile for linter to keep a small image size DOCKERFILE="linter/Dockerfile" +<<<<<<< HEAD elif [[ "$image" == *riscv* ]]; then # Use RISC-V specific Dockerfile DOCKERFILE="ubuntu-cross-riscv/Dockerfile" @@ -90,6 +97,15 @@ _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77 +======= +fi + +_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb +_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +if [[ "$image" == *rocm* ]]; then + _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 + _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi tag=$(echo $image | awk -F':' '{print $2}') @@ -98,6 +114,7 @@ tag=$(echo $image | awk -F':' '{print $2}') # configuration, so we hardcode everything here rather than do it # from scratch case "$tag" in +<<<<<<< HEAD pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11) CUDA_VERSION=12.4 ANACONDA_PYTHON_VERSION=3.10 @@ -121,6 +138,11 @@ case "$tag" in ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) CUDA_VERSION=13.0.0 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes @@ -131,6 +153,10 @@ case "$tag" in ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 +<<<<<<< HEAD +======= + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -140,18 +166,92 @@ case 
"$tag" in TRITON=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm) CUDA_VERSION=12.8.1 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) + CUDA_VERSION=12.6.3 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + ;; +<<<<<<< HEAD + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) + CUDA_VERSION=12.8.1 +======= + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + TRITON=yes + INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -161,17 +261,27 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-clang12-onnx) +<<<<<<< HEAD ANACONDA_PYTHON_VERSION=3.10 +======= + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes ONNX=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-py3.10-clang12) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes TRITON=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 @@ -181,11 +291,31 @@ case "$tag" in GCC_VERSION=11 VISION=yes ROCM_VERSION=7.0 +======= + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.9-gcc9) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=9 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-rocm-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + 
VISION=yes + ROCM_VERSION=6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100" if [[ $tag =~ "benchmarks" ]]; then INDUCTOR_BENCHMARKS=yes @@ -195,10 +325,39 @@ case "$tag" in ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes +======= + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-rocm-n-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.4 + NINJA_VERSION=1.9.0 + TRITON=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-xpu-2025.0-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes + XPU_VERSION=2025.0 + NINJA_VERSION=1.9.0 + TRITON=yes + ;; + pytorch-linux-jammy-xpu-2025.1-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-xpu-n-py3) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 @@ -209,6 +368,10 @@ case "$tag" in ;; pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GCC_VERSION=11 VISION=yes KATEX=yes @@ -216,20 +379,46 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CLANG_VERSION=12 VISION=yes TRITON=yes ;; +<<<<<<< HEAD +======= + pytorch-linux-jammy-py3-clang12-asan) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3-clang15-asan) + ANACONDA_PYTHON_VERSION=3.10 + CLANG_VERSION=15 + VISION=yes + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pytorch-linux-jammy-py3-clang18-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=18 VISION=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-py3.10-gcc11) ANACONDA_PYTHON_VERSION=3.10 +======= + pytorch-linux-jammy-py3.9-gcc11) + ANACONDA_PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GCC_VERSION=11 VISION=yes KATEX=yes @@ -256,10 +445,20 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) +<<<<<<< HEAD PYTHON_VERSION=3.10 ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) PYTHON_VERSION=3.10 +======= + # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. + # We will need to update mypy version eventually, but that's for another day. 
The task + # would be to upgrade mypy to 1.0.0 with Python 3.11 + PYTHON_VERSION=3.9 + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) + PYTHON_VERSION=3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) @@ -267,6 +466,10 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes +<<<<<<< HEAD +======= + CONDA_CMAKE=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific @@ -277,18 +480,27 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes +<<<<<<< HEAD +======= + CONDA_CMAKE=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-noble-riscv64-py3.12-gcc14) GCC_VERSION=14 ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes +======= + *) + # Catch-all for builds that are not hardcoded. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes echo "image '$image' did not match an existing build configuration" if [[ "$image" == *py* ]]; then @@ -296,6 +508,10 @@ case "$tag" in fi if [[ "$image" == *cuda* ]]; then extract_version_from_image_name cuda CUDA_VERSION +<<<<<<< HEAD +======= + extract_version_from_image_name cudnn CUDNN_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$image" == *rocm* ]]; then extract_version_from_image_name rocm ROCM_VERSION @@ -303,7 +519,10 @@ case "$tag" in TRITON=yes # To ensure that any ROCm config will build using conda cmake # and thus have LAPACK/MKL enabled +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$image" == *centos7* ]]; then NINJA_VERSION=1.10.2 @@ -320,9 +539,12 @@ case "$tag" in if [[ "$image" == *glibc* ]]; then extract_version_from_image_name glibc GLIBC_VERSION fi +<<<<<<< HEAD if [[ "$image" == *cmake* ]]; then extract_version_from_image_name cmake CMAKE_VERSION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; esac @@ -336,20 +558,29 @@ if [[ -n "${CI:-}" ]]; then progress_flag="--progress=plain" fi +<<<<<<< HEAD if [[ "${DOCKER_BUILDKIT}" == 0 ]]; then progress_flag="" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Build image docker build \ ${no_cache_flag} \ ${progress_flag} \ --build-arg "BUILD_ENVIRONMENT=${image}" \ +<<<<<<< HEAD --build-arg "PROTOBUF=${PROTOBUF:-}" \ --build-arg "LLVMDEV=${LLVMDEV:-}" \ --build-arg "VISION=${VISION:-}" \ --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \ +======= + --build-arg "LLVMDEV=${LLVMDEV:-}" \ + --build-arg "VISION=${VISION:-}" \ + --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \ --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ @@ -357,6 +588,7 @@ docker build \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "GCC_VERSION=${GCC_VERSION}" \ --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ +<<<<<<< HEAD --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ @@ -366,6 +598,18 @@ docker build \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ +======= + --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ + --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ + --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ + --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ + --build-arg "KATEX=${KATEX:-}" \ + --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \ + --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ + --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ + --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "TRITON=${TRITON}" \ --build-arg "TRITON_CPU=${TRITON_CPU}" \ --build-arg "ONNX=${ONNX}" \ @@ -379,7 +623,10 @@ docker build \ --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ +<<<<<<< HEAD --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ @@ -420,6 +667,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi if [ -n "$GCC_VERSION" ]; then +<<<<<<< HEAD if [[ "$image" == *riscv* ]]; then # Check RISC-V cross-compilation toolchain version if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then @@ -428,6 +676,9 @@ if [ -n "$GCC_VERSION" ]; then exit 1 fi elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then +======= + if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "GCC_VERSION=$GCC_VERSION, but:" drun gcc --version exit 1 @@ -454,9 +705,26 @@ HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || ec if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then if [ "$HAS_TRITON" = "no" ]; then echo "expecting triton to be installed, but it is not" +<<<<<<< HEAD exit 0 fi elif [ "$HAS_TRITON" = "yes" ]; then echo "expecting triton to not be installed, but it is" exit 0 +======= + exit 1 + fi +elif [ "$HAS_TRITON" = "yes" ]; then + echo "expecting triton to not be installed, but it is" + exit 1 +fi + +# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if +# they support 4.0.0 yet, so exclude them from this check. 
+CMAKE_VERSION=$(drun cmake --version) +if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then + echo "CMake version is not 4.0.0:" + drun cmake --version + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index 3d35bb79c7b8d..05d6320cabc05 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -1,7 +1,13 @@ ARG CENTOS_VERSION +<<<<<<< HEAD FROM quay.io/centos/centos:stream${CENTOS_VERSION} +======= +FROM centos:${CENTOS_VERSION} + +ARG CENTOS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set AMD gpu targets to build for ARG PYTORCH_ROCM_ARCH @@ -13,6 +19,7 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +<<<<<<< HEAD #Install langpack RUN yum install -y glibc-langpack-en @@ -28,6 +35,21 @@ ENV BASH_ENV "/etc/profile" # Install ninja RUN dnf --enablerepo=crb install -y ninja-build +======= +# Update CentOS git version +RUN yum -y remove git +RUN yum -y remove git-* +RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ + sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo +RUN yum install -y git + +# Install devtoolset +ARG DEVTOOLSET_VERSION +COPY ./common/install_devtoolset.sh install_devtoolset.sh +RUN bash ./install_devtoolset.sh && rm install_devtoolset.sh +ENV BASH_ENV "/etc/profile" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default glibc version ARG GLIBC_VERSION COPY ./common/install_glibc.sh install_glibc.sh @@ -48,7 +70,10 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -58,6 +83,7 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +<<<<<<< HEAD RUN mkdir ci_commit_pins COPY ./common/common_utils.sh common_utils.sh COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt @@ -65,14 +91,22 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh common_utils.sh RUN rm -r ci_commit_pins +======= +COPY ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh +RUN rm install_rocm.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh COPY ./common/install_amdsmi.sh install_amdsmi.sh RUN bash ./install_amdsmi.sh RUN rm install_amdsmi.sh +<<<<<<< HEAD ENV ROCM_PATH /opt/rocm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH /opt/rocm/bin:$PATH ENV PATH 
/opt/rocm/hcc/bin:$PATH ENV PATH /opt/rocm/hip/bin:$PATH @@ -82,7 +116,10 @@ ENV MAGMA_HOME /opt/rocm/magma ENV LANG en_US.utf8 ENV LC_ALL en_US.utf8 +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh @@ -98,7 +135,11 @@ COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton.txt triton.txt COPY triton_version.txt triton_version.txt +<<<<<<< HEAD #RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +======= +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt # Install ccache/sccache (do this last, so we get priority in PATH) diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index f2e2d655a6cf2..f0f16576caf54 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1,5 @@ +<<<<<<< HEAD deb42f2a8e48f5032b4a98ee781a15fa87a157cf +======= +56392aa978594cc155fa8af48cd949f5b5f1823a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt new file mode 100644 index 0000000000000..f00d6ca4f9ca7 --- /dev/null +++ b/.ci/docker/ci_commit_pins/huggingface.txt @@ -0,0 +1 @@ +243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index 77a73992346c1..57a4f51b2dd1e 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1,5 @@ -v2.27.5-1 \ No newline at end of file +<<<<<<< HEAD +v2.27.5-1 +======= +v2.27.3-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index b03606f6defc1..6abd4a388f1c2 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 1b0418a9a454b2b93ab8d71f40e59d2297157fae +======= +ae324eeac8e102a2b40370e341460f3791353398 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index d893bdd32ab34..03fc672f6eaaf 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1,5 @@ +<<<<<<< HEAD ac80c4190aa0321f761a08af97e1e1eee41f01d9 +======= +21876a4bbaf371bcb83df8e6ee4f43a92f524dfe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/cache_vision_models.sh b/.ci/docker/common/cache_vision_models.sh index 8380c48177de3..760cbb85cd2a9 100644 --- a/.ci/docker/common/cache_vision_models.sh +++ b/.ci/docker/common/cache_vision_models.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD # Skip pytorch-nightly installation in docker images # Installation of pytorch-nightly is needed to 
prefetch mobilenet_v2 avd v3 models for some tests. # Came from https://github.com/ROCm/pytorch/commit/85bd6bc0105162293fa0bbfb7b661f85ec67f85a @@ -16,6 +17,8 @@ set -ex echo "Skip torch-nightly installation" exit 0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" # Cache the test models at ~/.cache/torch/hub/ diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh index 0b865e5bc6f8d..eae1f89b5a90e 100755 --- a/.ci/docker/common/install_acl.sh +++ b/.ci/docker/common/install_acl.sh @@ -1,3 +1,4 @@ +<<<<<<< HEAD #!/bin/bash # Script used only in CD pipeline @@ -24,4 +25,22 @@ do sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d} done -rm -rf $ACL_CHECKOUT_DIR \ No newline at end of file +rm -rf $ACL_CHECKOUT_DIR +======= +set -euo pipefail + +readonly version=v25.02 +readonly src_host=https://github.com/ARM-software +readonly src_repo=ComputeLibrary + +# Clone ACL +[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git +cd ${src_repo} + +git checkout $version + +# Build with scons +scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ + os=linux arch=armv8a build=native multi_isa=1 \ + fixed_format_kernels=1 openmp=1 cppthreads=0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index a1c98aa25a31a..adeb1b812b984 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -77,15 +77,19 @@ install_ubuntu() { # see: https://github.com/pytorch/pytorch/issues/65931 apt-get install -y libgnutls30 +<<<<<<< HEAD if [[ "$UBUNTU_VERSION" == "22.04"* ]]; then apt-get install -y libopenblas-dev fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup package manager apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* } +<<<<<<< HEAD build_libpng() { # install few packages yum install -y zlib zlib-devel @@ -125,6 +129,16 @@ install_centos() { ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" numpy_deps="gcc-gfortran" yum install -y $ALLOW_ERASE \ +======= +install_centos() { + # Need EPEL for many packages we depend on. + # See http://fedoraproject.org/wiki/EPEL + yum --enablerepo=extras install -y epel-release + + ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" + numpy_deps="gcc-gfortran" + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) $ccache_deps \ $numpy_deps \ autoconf \ @@ -141,13 +155,20 @@ install_centos() { glibc-headers \ glog-devel \ libstdc++-devel \ +<<<<<<< HEAD + make \ +======= + libsndfile-devel \ make \ + opencv-devel \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo \ wget \ vim \ unzip \ gdb +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]] then dnf --enablerepo=crb -y install libsndfile-devel @@ -163,6 +184,8 @@ install_centos() { # Libpng is required for torchvision build. 
build_libpng +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -170,10 +193,15 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') # Install base packages depending on the base OS +======= +# Install base packages depending on the base OS +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case "$ID" in ubuntu) install_ubuntu diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index 80839990e4e6f..4b220cba6ed5a 100644 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -36,12 +36,16 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment export PATH="/opt/cache/bin:$PATH" # Setup compiler cache +<<<<<<< HEAD if [ -n "$ROCM_VERSION" ]; then curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache else install_ubuntu fi +======= +install_ubuntu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 68f17b73f10a1..d7da44ec5ec50 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -24,10 +24,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then source "${SCRIPT_FOLDER}/common_utils.sh" pushd /tmp +<<<<<<< HEAD if [ -n $CENTOS_VERSION ] && [[ $CENTOS_VERSION == 7.* ]]; then NO_CHECK_CERTIFICATE_FLAG="--no-check-certificate" fi wget -q "${BASE_URL}/${CONDA_FILE}" ${NO_CHECK_CERTIFICATE_FLAG} +======= + wget -q "${BASE_URL}/${CONDA_FILE}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431 as_jenkins bash "${CONDA_FILE}" -b -f -p "/opt/conda" popd @@ -43,6 +47,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Prevent conda from updating to 4.14.0, which causes docker build failures # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d +<<<<<<< HEAD # Uncomment the below when resolved to track the latest conda update, # but this is required for CentOS stream 9 builds ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') @@ -50,6 +55,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then if [[ $ID == centos && $OS_VERSION == 9 ]]; then as_jenkins conda update -y -n base conda fi +======= + # Uncomment the below when resolved to track the latest conda update + # as_jenkins conda update -y -n base conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $(uname -m) == "aarch64" ]]; then export SYSROOT_DEP="sysroot_linux-aarch64=2.17" @@ -73,10 +82,17 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README +<<<<<<< HEAD if [[ $(uname -m) != "aarch64" ]]; then pip_install mkl==2024.2.0 pip_install mkl-static==2024.2.0 pip_install mkl-include==2024.2.0 +======= + if [[ $(uname -m) == 
"aarch64" ]]; then + conda_install "openblas==0.3.29=*openmp*" + else + conda_install "mkl=2021.4.0 mkl-include=2021.4.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source @@ -94,6 +110,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install_through_forge libstdcxx-ng=14 fi +<<<<<<< HEAD # Install required libstdc++.so.6 version if [ "$ANACONDA_PYTHON_VERSION" = "3.10" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.9" ] ; then conda_install_through_forge libstdcxx-ng=12 @@ -103,6 +120,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install_through_forge libstdcxx-ng=14 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install some other packages, including those needed for Python test reporting pip_install -r /opt/conda/requirements-ci.txt diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c873c930097b1..9b0a894a33558 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -3,10 +3,18 @@ set -uex -o pipefail PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +<<<<<<< HEAD GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py # Python versions to be installed in /opt/$VERSION_NO CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t 3.14.0 3.14.0t"} +======= +PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads # @lint-ignore +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +# Python versions to be installed in /opt/$VERSION_NO +CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function check_var { if [ -z "$1" ]; then @@ -23,8 +31,14 @@ function do_cpython_build { tar -xzf Python-$py_ver.tgz local additional_flags="" +<<<<<<< HEAD if [[ "$py_ver" == *"t" ]]; then additional_flags=" --disable-gil" +======= + if [ "$py_ver" == "3.13.0t" ]; then + additional_flags=" --disable-gil" + mv cpython-3.13/ cpython-3.13t/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi pushd $py_folder @@ -66,15 +80,21 @@ function do_cpython_build { ln -s pip3 ${prefix}/bin/pip fi # install setuptools since python 3.12 is required to use distutils +<<<<<<< HEAD # packaging is needed to create symlink since wheel no longer provides needed information ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0 local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))") +======= + ${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2 + local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ln -sf ${prefix} /opt/python/${abi_tag} } function build_cpython { local py_ver=$1 check_var $py_ver 
+<<<<<<< HEAD local py_suffix=$py_ver local py_folder=$py_ver @@ -85,6 +105,26 @@ function build_cpython { fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix +======= + check_var $PYTHON_DOWNLOAD_URL + local py_ver_folder=$py_ver + + if [ "$py_ver" = "3.13.0t" ]; then + PY_VER_SHORT="3.13" + PYT_VER_SHORT="3.13t" + check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH + wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz + do_cpython_build $py_ver cpython-$PYT_VER_SHORT + elif [ "$py_ver" = "3.13.0" ]; then + PY_VER_SHORT="3.13" + check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH + wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz + do_cpython_build $py_ver cpython-$PY_VER_SHORT + else + wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz + do_cpython_build $py_ver Python-$py_ver + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rm -f Python-$py_ver.tgz } diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index fe2f9ae3185a3..2aa2fd95c165c 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -10,8 +10,11 @@ else arch_path='sbsa' fi +<<<<<<< HEAD NVSHMEM_VERSION=3.4.5 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function install_cuda { version=$1 runfile=$2 @@ -42,6 +45,7 @@ function install_cudnn { rm -rf tmp_cudnn } +<<<<<<< HEAD function install_nvshmem { cuda_major_version=$1 # e.g. "12" nvshmem_version=$2 # e.g. "3.3.9" @@ -97,12 +101,20 @@ function install_124 { function install_126 { CUDNN_VERSION=9.10.2.21 echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= +function install_126 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.6 bash install_nccl.sh CUDA_VERSION=12.6 bash install_cusparselt.sh @@ -112,15 +124,22 @@ function install_126 { function install_129 { CUDNN_VERSION=9.10.2.21 +<<<<<<< HEAD echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= + echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # install CUDA 12.9.1 in the same container install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.9 bash install_nccl.sh CUDA_VERSION=12.9 bash install_cusparselt.sh @@ -128,17 +147,60 @@ function install_129 { ldconfig } +<<<<<<< HEAD function install_128 { CUDNN_VERSION=9.8.0.87 echo "Installing 
CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" +======= +function prune_126 { + echo "Pruning CUDA 12.6" + ##################################################################################### + # CUDA 12.6 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64" + + export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then + export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN + fi + + # all CUDA libs except CuDNN and CuBLAS + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 12.6 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-12.6/" + rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/ +} + +function install_128 { + CUDNN_VERSION=9.8.0.87 + echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # install CUDA 12.8.1 in the same container install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement install_cudnn 12 $CUDNN_VERSION +<<<<<<< HEAD install_nvshmem 12 $NVSHMEM_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.8 bash install_nccl.sh CUDA_VERSION=12.8 bash install_cusparselt.sh @@ -146,6 +208,7 @@ function install_128 { ldconfig } +<<<<<<< HEAD function install_130 { CUDNN_VERSION=9.13.0.50 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" @@ -164,20 +227,29 @@ function install_130 { ldconfig } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # idiomatic parameter and option handling in sh while test $# -gt 0 do case "$1" in +<<<<<<< HEAD 12.4) install_124; ;; 12.6|12.6.*) install_126; +======= + 12.6|12.6.*) install_126; prune_126 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; 
12.8|12.8.*) install_128; ;; 12.9|12.9.*) install_129; ;; +<<<<<<< HEAD 13.0|13.0.*) install_130; ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *) echo "bad argument $1"; exit 1 ;; esac diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh new file mode 100644 index 0000000000000..7ee5e73226cb6 --- /dev/null +++ b/.ci/docker/common/install_cudnn.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [[ -n "${CUDNN_VERSION}" ]]; then + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn + pushd tmp_cudnn + if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" + else + echo "Unsupported CUDA version ${CUDA_VERSION}" + exit 1 + fi + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz + tar xf ${CUDNN_NAME}.tar.xz + cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ + cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cudnn + ldconfig +fi diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index b532c086371f1..3443da6482a1e 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,6 +5,7 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} @@ -14,6 +15,9 @@ if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +======= +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then @@ -21,6 +25,7 @@ elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then fi CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +<<<<<<< HEAD elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} @@ -29,6 +34,8 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then fi CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" fi diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index
fb168acd4febe..ede624e175562 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,13 +42,17 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy +<<<<<<< HEAD # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) pip_install yaspin==3.1.0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd } setup_executorch() { +<<<<<<< HEAD export PYTHON_EXECUTABLE=python export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" @@ -66,3 +70,19 @@ if [ $# -eq 0 ]; then else "$@" fi +======= + pushd executorch + + export PYTHON_EXECUTABLE=python + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + + as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true + popd +} + +clone_executorch +install_buck2 +install_conda_dependencies +install_pip_dependencies +setup_executorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 81467d87f5140..c8ac925d402ad 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -5,7 +5,13 @@ set -ex source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" function install_huggingface() { +<<<<<<< HEAD pip_install -r huggingface-requirements.txt +======= + local version + commit=$(get_pinned_commit huggingface) + pip_install "git+https://github.com/huggingface/transformers@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_timm() { @@ -13,6 +19,7 @@ function install_timm() { commit=$(get_pinned_commit timm) pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" +<<<<<<< HEAD } function install_torchbench() { @@ -30,10 +37,15 @@ function install_torchbench() { chown -R jenkins torchbench chown -R jenkins /opt/conda +======= + # Clean up + conda_run pip uninstall -y torch torchvision triton +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } # Pango is needed for weasyprint which is needed for doctr conda_install pango +<<<<<<< HEAD # Stable packages are ok here, just to satisfy TorchBench check pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 @@ -44,3 +56,7 @@ install_timm # Clean up conda_run pip uninstall -y torch torchvision torchaudio triton torchao +======= +install_huggingface +install_timm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh index 58a8e0b4e49c1..ea0cdfc2bf703 100644 --- a/.ci/docker/common/install_nccl.sh +++ b/.ci/docker/common/install_nccl.sh @@ -7,8 +7,11 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then NCCL_VERSION=$(cat 
ci_commit_pins/nccl-cu12.txt) +<<<<<<< HEAD elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" exit 1 diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 77fd0ff126a43..edccab581ecd6 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,13 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging +<<<<<<< HEAD pip_install onnxruntime==1.23.1 pip_install onnxscript==0.5.4 +======= +pip_install onnxruntime==1.18.1 +pip_install onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 2f386c6bd523a..9baed367ba955 100755 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -3,10 +3,15 @@ set -ex +<<<<<<< HEAD OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"} # Clone OpenBLAS git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules +======= +cd / +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" @@ -19,7 +24,12 @@ CFLAGS=-O3 BUILD_BFLOAT16=1 " +<<<<<<< HEAD make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR sudo make install -C $OPENBLAS_CHECKOUT_DIR -rm -rf $OPENBLAS_CHECKOUT_DIR \ No newline at end of file +rm -rf $OPENBLAS_CHECKOUT_DIR +======= +make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} +make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index afba246cbf0c7..eff17bd922f23 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -2,11 +2,14 @@ set -ex +<<<<<<< HEAD # for pip_install function source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ver() { printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' 
' '); } @@ -35,6 +38,7 @@ EOF # we want the patch version of 6.4 instead if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then +<<<<<<< HEAD ROCM_VERSION="${ROCM_VERSION}.2" fi @@ -48,6 +52,18 @@ EOF # Add rocm repository wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +======= + ROCM_VERSION="${ROCM_VERSION}.1" + fi + + # Add amdgpu repository + UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` + echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list + + # Add rocm repository + wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - + local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list apt-get update --allow-insecure-repositories @@ -60,9 +76,25 @@ EOF roctracer-dev \ amd-smi-lib +<<<<<<< HEAD # precompiled miopen kernels is too old and never updated from last 3+yrs so removing the logic to install # Also, these kernels are not generating for MI300X, MI350 and also not reliable anymore +======= + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev + fi + + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime for kdb in /opt/rocm/share/miopen/db/*.kdb @@ -71,6 +103,7 @@ EOF done # ROCm 6.3 had a regression where initializing static code objects had significant overhead +<<<<<<< HEAD # CI no longer builds for ROCm 6.3, but # ROCm 6.4 did not yet fix the regression, also HIP branch names are different if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then @@ -98,12 +131,40 @@ EOF cmake .. 
-DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR make -j cp hipamd/lib/libamdhip64.so.6.4.* /opt/rocm/lib/libamdhip64.so.6.4.* +======= + # ROCm 6.4 did not yet fix the regression, also HIP branch names are different + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + VER_PATCH=.1 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then + HIP_BRANCH=rocm-6.3.x + VER_STR=6.3 + fi + # clr build needs CppHeaderParser but can only find it using conda's python + /opt/conda/bin/python -m pip install CppHeaderParser + git clone https://github.com/ROCm/HIP -b $HIP_BRANCH + HIP_COMMON_DIR=$(readlink -f HIP) + git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix + mkdir -p clr/build + pushd clr/build + cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR + make -j + cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd rm -rf HIP clr fi +<<<<<<< HEAD pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -114,6 +175,7 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial @@ -150,6 +212,28 @@ install_centos() { else local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}/main" fi +======= + yum install -y openblas-devel + + yum install -y epel-release + yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + + # Add amdgpu repository + local amdgpu_baseurl + if [[ $OS_VERSION == 9 ]]; then + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64" + else + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64" + fi + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo @@ -157,6 +241,7 @@ install_centos() { echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum update -y --nogpgcheck dnf --enablerepo=crb install -y perl-File-BaseDir python3-wheel @@ -164,6 +249,11 @@ install_centos() { else yum update -y yum install -y \ +======= + yum update -y + 
+ yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rocm-dev \ rocm-utils \ rocm-libs \ @@ -171,9 +261,21 @@ install_centos() { rocprofiler-dev \ roctracer-dev \ amd-smi-lib +<<<<<<< HEAD fi # precompiled miopen kernels is too old and never updated from last 3+yrs so removing the logic to install # Also, these kernels are not generating for MI300X, MI350 and also not reliable anymore +======= + + # precompiled miopen kernels; search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + yum install -y ${MIOPENHIPGFX} + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime for kdb in /opt/rocm/share/miopen/db/*.kdb @@ -181,8 +283,11 @@ install_centos() { sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" done +<<<<<<< HEAD pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -190,8 +295,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Python packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 9bf45e6f1b0a9..9ba07c8e26331 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,13 @@ function do_install() { rocm_version_nodot=${rocm_version//./} +<<<<<<< HEAD # https://github.com/icl-utk-edu/magma/pull/65 MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec +======= + # Version 2.7.2 + ROCm related updates + MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index b2fdebdcc4747..1a670aaaa9f73 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -57,7 +57,11 @@ if [ ! 
-f setup.py ]; then cd python fi +<<<<<<< HEAD pip_install pybind11==3.0.1 +======= +pip_install pybind11==2.13.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py @@ -66,15 +70,25 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 conda_run python -m build --wheel --no-isolation +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 conda_run python -m build --wheel --no-isolation else conda_run python -m build --wheel --no-isolation +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +else + conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Copy the wheel to /opt for multi stage docker builds @@ -98,10 +112,15 @@ fi if [ -n "${NUMPY_VERSION}" ]; then pip_install "numpy==${NUMPY_VERSION}" fi +<<<<<<< HEAD # IMPORTANT: helion needs to be installed without dependencies. # It depends on torch and triton. We don't want to install # triton and torch from production on Docker CI images if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then pip_install helion --no-deps +======= +if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then + pip_install helion +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index 04f15a52e88e3..10048ebc19efc 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,12 +44,17 @@ function install_ucc() { ./autogen.sh +<<<<<<< HEAD if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" else # We only run distributed tests on Tesla M60 and A10G NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" fi +======= + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh index 532d8d14a55c8..665b8c0805c65 100755 --- a/.ci/docker/common/install_vision.sh +++ b/.ci/docker/common/install_vision.sh @@ -15,6 +15,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. 
# See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release else @@ -23,6 +24,12 @@ install_centos() { opencv-devel \ ffmpeg-devel fi +======= + yum --enablerepo=extras install -y epel-release + + yum install -y \ + opencv-devel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all @@ -31,8 +38,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install base packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 0b150872f93ce..f77c9bb6d2f95 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -34,6 +34,7 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi +<<<<<<< HEAD if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then # Compute and Media Runtimes @@ -55,6 +56,20 @@ function install_ubuntu() { apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi +======= + # Compute and Media Runtimes + apt-get install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then + apt-get install -y intel-ocloc + fi + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -143,6 +158,7 @@ function install_sles() { } +<<<<<<< HEAD # Default use GPU driver rolling releases XPU_DRIVER_VERSION="" if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then @@ -155,6 +171,20 @@ if [[ "$XPU_VERSION" == "2025.2" ]]; then XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +======= +# Default use GPU driver LTS releases +XPU_DRIVER_VERSION="/lts/2350" +if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then + # Use GPU driver rolling releases + XPU_DRIVER_VERSION="" +fi + +# Default use Intel® oneAPI Deep Learning Essentials 2025.0 +if [[ "$XPU_VERSION" == "2025.1" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +else + XPU_PACKAGES="intel-deep-learning-essentials-2025.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # The installation depends on the base OS diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index c93f022268b25..1c7781b192289 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -69,6 +69,7 @@ RUN bash ./install_cuda.sh 12.9 RUN bash ./install_magma.sh 12.9 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda +<<<<<<< HEAD FROM cuda as cuda13.0 RUN bash ./install_cuda.sh 13.0 RUN bash 
./install_magma.sh 13.0 @@ -83,6 +84,10 @@ RUN apt-get update -y && \ cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ FROM cpu as rocm +======= +FROM cpu as rocm +ARG ROCM_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel @@ -101,7 +106,11 @@ RUN apt-get update -y && \ apt-get clean RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM ${BASE_TARGET} as final COPY --from=openssl /opt/openssl /opt/openssl diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index c40896cb5499f..756be70a0a24c 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -39,6 +39,7 @@ case ${DOCKER_TAG_PREFIX} in DOCKER_GPU_BUILD_ARG="" ;; rocm*) +<<<<<<< HEAD # we want the patch version of 7.0 instead if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" @@ -54,6 +55,11 @@ case ${DOCKER_TAG_PREFIX} in if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi +======= + BASE_TARGET=rocm + GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/linter/Dockerfile b/.ci/docker/linter/Dockerfile index 95d08ffea051d..658ad4a91709e 100644 --- a/.ci/docker/linter/Dockerfile +++ b/.ci/docker/linter/Dockerfile @@ -27,7 +27,10 @@ COPY ./common/install_linter.sh install_linter.sh RUN bash ./install_linter.sh RUN rm install_linter.sh +<<<<<<< HEAD RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index 4803cb778c905..ebbce2f360f93 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,8 +130,12 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; +<<<<<<< HEAD ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ @@ -176,6 +180,10 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh +<<<<<<< HEAD ENV XPU_VERSION 2025.2 +======= +ENV XPU_VERSION 2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 768db09929361..9a45488354c4a 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -62,6 +62,7 @@ ARG OPENBLAS_VERSION ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh +<<<<<<< HEAD # Install Arm Compute Library FROM base as arm_compute # use python3.9 to install scons @@ -69,6 +70,8 @@ RUN python3.9 -m pip install scons==4.7.0 RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin COPY ./common/install_acl.sh install_acl.sh RUN bash ./install_acl.sh && rm install_acl.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as final # remove unnecessary python versions @@ -77,7 +80,11 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ +<<<<<<< HEAD COPY --from=arm_compute /acl /acl ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 347a01ee4ede7..35aba8e282941 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -86,6 +86,7 @@ FROM base as nvpl ADD ./common/install_nvpl.sh install_nvpl.sh RUN bash ./install_nvpl.sh && rm install_nvpl.sh +<<<<<<< HEAD # Install Arm Compute Library FROM base as arm_compute # use python3.9 to install scons @@ -95,6 +96,8 @@ COPY ./common/install_acl.sh install_acl.sh RUN bash ./install_acl.sh && rm install_acl.sh FROM base as final +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM final as cuda_final ARG BASE_CUDA_VERSION RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} @@ -102,9 +105,14 @@ COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ +<<<<<<< HEAD COPY --from=arm_compute /acl /acl RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi new 
file mode 100644 index 0000000000000..ed33cc61df093 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_cxx11-abi @@ -0,0 +1,71 @@ +FROM centos:8 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# change to a valid repo +RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo +# enable to install ninja-build +RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo + +RUN yum -y update +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo +RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ + + +FROM base as openssl +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + +# Install python +FROM base as python +RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel +ADD common/install_cpython.sh install_cpython.sh +RUN bash ./install_cpython.sh && rm install_cpython.sh + +FROM base as conda +ADD ./common/install_conda_docker.sh install_conda.sh +RUN bash ./install_conda.sh && rm install_conda.sh +RUN /opt/conda/bin/conda install -y cmake + +FROM base as intel +# Install MKL +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=conda /opt/conda /opt/conda +ENV PATH=/opt/conda/bin:$PATH +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM base as jni +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM base as final +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=intel /opt/intel /opt/intel +COPY --from=conda /opt/conda /opt/conda +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig + +RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 1cf83acb1c736..0a85278e8b0eb 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -115,9 +115,12 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio # cmake-3.28.0 from pip for onnxruntime RUN python3 -mpip install cmake==3.28.0 +<<<<<<< HEAD ADD ./common/patch_libstdc.sh patch_libstdc.sh RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # 
build onnxruntime 1.21.0 from sources. # it is not possible to build it from sources using pip, # so just build it from upstream repository. @@ -134,8 +137,11 @@ RUN pip3 install flatbuffers && \ git clone https://github.com/microsoft/onnxruntime && \ cd onnxruntime && git checkout v1.21.0 && \ git submodule update --init --recursive && \ +<<<<<<< HEAD wget https://github.com/microsoft/onnxruntime/commit/f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ patch -p1 < f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./build.sh --config Release --parallel 0 --enable_pybind \ --build_wheel --enable_training --enable_training_apis \ --enable_training_ops --skip_tests --allow_running_as_root \ diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index b4b5059973037..a4fc37c81e1a8 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -28,7 +28,10 @@ fi MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} OPENBLAS_VERSION=${OPENBLAS_VERSION:-} +<<<<<<< HEAD ACL_VERSION=${ACL_VERSION:-} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case ${image} in manylinux2_28-builder:cpu) @@ -42,6 +45,16 @@ case ${image} in GPU_IMAGE=arm64v8/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" MANY_LINUX_VERSION="2_28_aarch64" +<<<<<<< HEAD +======= + OPENBLAS_VERSION="v0.3.30" + ;; + manylinuxcxx11-abi-builder:cpu-cxx11-abi) + TARGET=final + GPU_IMAGE="" + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" + MANY_LINUX_VERSION="cxx11-abi" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; manylinuxs390x-builder:cpu-s390x) TARGET=final @@ -61,12 +74,15 @@ case ${image} in DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; +<<<<<<< HEAD manylinux2_28-builder:cuda13*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manylinuxaarch64-builder:cuda*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 @@ -75,6 +91,7 @@ case ${image} in DOCKERFILE_SUFFIX="_cuda_aarch64" ;; manylinux2_28-builder:rocm*) +<<<<<<< HEAD # we want the patch version of 7.0 instead if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" @@ -83,15 +100,20 @@ case ${image} in if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TARGET=rocm_final MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +<<<<<<< HEAD # add gfx950, gfx115x conditionally starting in ROCm 7.0 if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" fi +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) @@ -123,8 +145,12 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') DOCKER_BUILDKIT=1 docker build \ ${DOCKER_GPU_BUILD_ARG} \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ +<<<<<<< HEAD --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \ --build-arg "ACL_VERSION=${ACL_VERSION:-}" \ +======= + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --target "${TARGET}" \ -t "${tmp_tag}" \ $@ \ diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py index c4df0eacbb7fd..7f6f92f12c84b 100644 --- a/.ci/docker/manywheel/build_scripts/ssl-check.py +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -10,6 +10,14 @@ print("Testing SSL certificate checking for Python:", sys.version) +<<<<<<< HEAD +======= +if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXC = OSError print(f"Connecting to {GOOD_SSL} should work") diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 658d2d34a6474..677883fdcfd3d 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -10,11 +10,14 @@ boto3==1.35.42 #Pinned versions: 1.19.12, 1.16.34 #test that import: +<<<<<<< HEAD build==1.3.0 #Description: A simple, correct Python build frontend. 
#Pinned versions: 1.3.0 #test that import: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) click #Description: Command Line Interface Creation Kit #Pinned versions: @@ -52,10 +55,17 @@ flatbuffers==24.12.23 #Pinned versions: 24.12.23 #test that import: +<<<<<<< HEAD hypothesis==6.56.4 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 #Description: advanced library for generating parametrized tests #Pinned versions: 6.56.4 +======= +hypothesis==5.35.1 +# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 +#Description: advanced library for generating parametrized tests +#Pinned versions: 3.44.6, 4.53.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py junitparser==2.1.1 @@ -68,12 +78,20 @@ lark==0.12.0 #Pinned versions: 0.12.0 #test that import: +<<<<<<< HEAD librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x" librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py #librosa depends on numba; disable it for s390x while numba is disabled too +======= +librosa>=0.6.2 ; python_version < "3.11" +librosa==0.10.2 ; python_version == "3.12" +#Description: A python package for music and audio analysis +#Pinned versions: >=0.6.2 +#test that import: test_spectral_ops.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #mkl #this breaks linux-bionic-rocm4.5-py3.7 #Description: Intel oneAPI Math Kernel Library @@ -98,9 +116,14 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: +<<<<<<< HEAD mypy==1.16.0 ; platform_system == "Linux" # Pin MyPy version because new errors are likely to appear with each release # Skip on Windows as lots of type annotations are POSIX specific +======= +mypy==1.16.0 +# Pin MyPy version because new errors are likely to appear with each release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -111,18 +134,31 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch +<<<<<<< HEAD ninja==1.11.1.4 #Description: build system. Used in some tests. Used in build to generate build #time tracing information #Pinned versions: 1.11.1.4 +======= +ninja==1.11.1.3 +#Description: build system. Used in some tests. 
Used in build to generate build +#time tracing information +#Pinned versions: 1.11.1.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.60.0 ; python_version == "3.9" numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions +<<<<<<< HEAD #Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 +======= +#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#test that import: test_numba_integration.py +#For numba issue see https://github.com/pytorch/pytorch/issues/51511 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #numpy #Description: Provides N-dimensional arrays and linear algebra @@ -166,6 +202,7 @@ optree==0.13.0 pillow==11.0.0 #Description: Python Imaging Library fork +<<<<<<< HEAD #Pinned versions: 11.0.0 #test that import: @@ -174,6 +211,16 @@ protobuf==5.29.5 #Pinned versions: 5.29.5 #test that import: test_tensorboard.py, test/onnx/* +======= +#Pinned versions: 10.3.0 +#test that import: + +protobuf==3.20.2 ; python_version <= "3.12" +protobuf==4.25.1 ; python_version == "3.13" +#Description: Google’s data interchange format +#Pinned versions: 3.20.1 +#test that import: test_tensorboard.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) psutil #Description: information on running processes and system utilization @@ -215,7 +262,11 @@ pytest-subtests==0.13.1 #Pinned versions: #test that import: +<<<<<<< HEAD xdoctest==1.3.0 +======= +xdoctest==1.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: runs doctests in pytest #Pinned versions: 1.1.0 #test that import: @@ -225,9 +276,15 @@ pygments==2.15.0 #Pinned versions: 2.12.0 #test that import: the doctests +<<<<<<< HEAD #pyyaml #Description: data serialization format #Pinned versions: 6.0.2 +======= +#PyYAML +#Description: data serialization format +#Pinned versions: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: #requests @@ -237,12 +294,22 @@ pygments==2.15.0 #rich #Description: rich text and beautiful formatting in the terminal +<<<<<<< HEAD #Pinned versions: 14.1.0 #test that import: scikit-image==0.22.0 #Description: image processing routines #Pinned versions: 0.22.0 +======= +#Pinned versions: 10.9.0 +#test that import: + +scikit-image==0.19.3 ; python_version < "3.10" +scikit-image==0.22.0 ; python_version >= "3.10" +#Description: image processing routines +#Pinned versions: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_nn.py #scikit-learn @@ -265,7 +332,11 @@ scipy==1.14.1 ; python_version > "3.9" #test that import: # needed by torchgen utils +<<<<<<< HEAD typing-extensions==4.12.2 +======= +typing-extensions>=4.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: type hints for python #Pinned versions: #test that import: @@ -305,7 +376,11 @@ pytest-cpp==2.3.0 #Pinned versions: 2.3.0 #test 
that import: +<<<<<<< HEAD z3-solver==4.15.1.0 ; platform_machine != "s390x" +======= +z3-solver==4.12.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: The Z3 Theorem Prover Project #Pinned versions: #test that import: @@ -326,6 +401,11 @@ lxml==5.3.0 ; python_version <= "3.12" lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting +<<<<<<< HEAD +======= +# Python-3.9 binaries + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PyGithub==2.3.0 sympy==1.13.3 @@ -333,6 +413,7 @@ sympy==1.13.3 #Pinned versions: #test that import: +<<<<<<< HEAD onnx==1.19.1 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal @@ -340,6 +421,15 @@ onnx==1.19.1 #test that import: onnxscript==0.5.4 +======= +onnx==1.16.1 ; python_version <= "3.12" +onnx==1.18.0 ; python_version == "3.13" +#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal +#Pinned versions: +#test that import: + +onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -358,12 +448,21 @@ pwlf==2.2.1 #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py +<<<<<<< HEAD # To build PyTorch itself pyyaml==6.0.2 pyzstd setuptools==78.1.1 packaging==23.1 six +======= + +# To build PyTorch itself +astunparse +PyYAML +pyzstd +setuptools +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scons==4.5.2 ; platform_machine == "aarch64" @@ -377,6 +476,7 @@ dataclasses_json==0.6.7 #Pinned versions: 0.6.7 #test that import: +<<<<<<< HEAD cmake==3.31.6 #Description: required for building @@ -395,3 +495,14 @@ scikit-build==0.18.1 pyre-extensions==0.0.32 tabulate==0.9.0 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI +======= +cmake==4.0.0 +#Description: required for building + +tlparse==0.3.30 +#Description: required for log parsing + +cuda-bindings>=12.0,<13.0 +#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. 
+#test that import: test_cuda.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 6e623b4c56949..d98390bf0cd7b 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,6 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 +<<<<<<< HEAD standard-imghdr==3.13.0; python_version >= "3.13" #Description: This is needed by Sphinx, so it needs to be added here. @@ -10,6 +11,10 @@ standard-imghdr==3.13.0; python_version >= "3.13" # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 +======= +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. @@ -26,10 +31,16 @@ sphinx_sitemap==2.6.0 #Description: This is used to generate sitemap for PyTorch docs #Pinned versions: 2.6.0 +<<<<<<< HEAD matplotlib==3.5.3 ; python_version < "3.13" matplotlib==3.6.3 ; python_version >= "3.13" #Description: This is used to generate PyTorch docs #Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3. +======= +matplotlib==3.5.3 +#Description: This is used to generate PyTorch docs +#Pinned versions: 3.5.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tensorboard==2.13.0 ; python_version < "3.13" tensorboard==2.18.0 ; python_version >= "3.13" @@ -57,8 +68,13 @@ IPython==8.12.0 #Pinned versions: 8.12.0 myst-nb==0.17.2 +<<<<<<< HEAD #Description: This is used to generate PyTorch functorch and torch.compile docs. 
#Pinned versions: 0.17.2 +======= +#Description: This is used to generate PyTorch functorch docs +#Pinned versions: 0.13.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index 1545d966571dc..561eb4a3cc51e 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.5.0 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 1545d966571dc..561eb4a3cc51e 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.5.0 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index b517a990a057b..52ace3ff45137 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -52,6 +52,7 @@ ENV INSTALLED_VISION ${VISION} # Install rocm ARG ROCM_VERSION +<<<<<<< HEAD RUN mkdir ci_commit_pins COPY ./common/common_utils.sh common_utils.sh COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt @@ -59,6 +60,11 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh common_utils.sh RUN rm -r ci_commit_pins +======= +COPY ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh +RUN rm install_rocm.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN rm install_rocm_magma.sh @@ -100,11 +106,18 @@ ARG ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index 8765249688ce5..8ab05c37b9ec5 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -56,10 +56,17 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY 
./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install XPU Dependencies ARG XPU_VERSION diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 84a74114c381e..150a585bac0c1 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,7 +66,10 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +<<<<<<< HEAD ARG CUDA_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr @@ -97,6 +100,7 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh +<<<<<<< HEAD COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt COPY ci_commit_pins/torchbench.txt torchbench.txt @@ -109,6 +113,12 @@ ARG INSTALL_MINGW COPY ./common/install_mingw.sh install_mingw.sh RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi RUN rm install_mingw.sh +======= +COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/timm.txt timm.txt +RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG TRITON ARG TRITON_CPU @@ -189,6 +199,10 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi # AWS specific CUDA build guidance +<<<<<<< HEAD +======= +ENV TORCH_CUDA_ARCH_LIST Maxwell +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" ENV CUDA_PATH /usr/local/cuda diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index c2d67f8b1bb29..7c668ca81e714 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,8 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +<<<<<<< HEAD USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh +======= +USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile index 9fca7ad544617..f7d21fcc579bc 100644 --- a/.ci/magma-rocm/Makefile +++ 
b/.ci/magma-rocm/Makefile @@ -1,11 +1,19 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker +<<<<<<< HEAD DESIRED_ROCM ?= 7.0 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) PACKAGE_NAME = magma-rocm # inherit this from underlying docker image, do not pass this env var to docker #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201 +======= +DESIRED_ROCM ?= 6.4 +DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) +PACKAGE_NAME = magma-rocm +# inherit this from underlying docker image, do not pass this env var to docker +#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -v $(shell git rev-parse --show-toplevel)/.ci:/builder \ @@ -16,20 +24,36 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma-rocm/build_magma.sh .PHONY: all +<<<<<<< HEAD all: magma-rocm70 all: magma-rocm64 +======= +all: magma-rocm64 +all: magma-rocm63 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD .PHONY: magma-rocm70 magma-rocm70: DESIRED_ROCM := 7.0 magma-rocm70: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-rocm64 magma-rocm64: DESIRED_ROCM := 6.4 magma-rocm64: $(DOCKER_RUN) +<<<<<<< HEAD +======= + +.PHONY: magma-rocm63 +magma-rocm63: DESIRED_ROCM := 6.3 +magma-rocm63: + $(DOCKER_RUN) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index c7c7780227ea5..0b435f5f337ef 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,13 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +<<<<<<< HEAD # https://github.com/icl-utk-edu/magma/pull/65 MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec +======= +# Version 2.7.2 + ROCm related updates +MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +25,11 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} +<<<<<<< HEAD git clone https://github.com/jeffdaily/magma +======= +git clone https://bitbucket.org/icl/magma.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 4169aedd03fa5..233925d95eb67 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -16,7 +16,10 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma/build_magma.sh .PHONY: all +<<<<<<< HEAD all: magma-cuda130 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) all: magma-cuda129 all: magma-cuda128 all: magma-cuda126 @@ -26,12 +29,15 @@ clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD .PHONY: magma-cuda130 magma-cuda130: DESIRED_CUDA := 13.0 magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 magma-cuda130: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-cuda129 magma-cuda129: DESIRED_CUDA := 12.9 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 diff --git a/.ci/magma/build_magma.sh b/.ci/magma/build_magma.sh index 6f1924fa45965..c88109ab01765 100755 --- a/.ci/magma/build_magma.sh +++ b/.ci/magma/build_magma.sh @@ -28,7 +28,10 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION} patch < ${PACKAGE_FILES}/CMake.patch patch < ${PACKAGE_FILES}/cmakelists.patch patch -p0 < ${PACKAGE_FILES}/thread_queue.patch +<<<<<<< HEAD patch -p1 < ${PACKAGE_FILES}/cuda13.patch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch # The build.sh script expects to be executed from the sources root folder @@ -38,7 +41,10 @@ popd # Package recipe, license and tarball # Folder and package name are backward compatible for the build workflow cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh +<<<<<<< HEAD cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch diff --git a/.ci/manywheel/build.sh b/.ci/manywheel/build.sh index 6b2a60bc5ca28..82339921b69dd 100755 --- a/.ci/manywheel/build.sh +++ 
b/.ci/manywheel/build.sh @@ -5,6 +5,13 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" case "${GPU_ARCH_TYPE:-BLANK}" in +<<<<<<< HEAD +======= + BLANK) + # Legacy behavior for CircleCI + bash "${SCRIPTPATH}/build_cuda.sh" + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda) bash "${SCRIPTPATH}/build_cuda.sh" ;; diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index b84268fd12896..2d8624f59b072 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -97,7 +97,11 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD retry pip install -qUr requirements-build.txt +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt case ${DESIRED_PYTHON} in @@ -138,11 +142,36 @@ fi echo "Calling setup.py bdist at $(date)" +<<<<<<< HEAD time CMAKE_ARGS=${CMAKE_ARGS[@]} \ EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR +======= +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" + time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +else + time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ + BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ + USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ + python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist at $(date)" # Build libtorch packages @@ -255,6 +284,13 @@ ls /tmp/$WHEELHOUSE_DIR mkdir -p "/$WHEELHOUSE_DIR" mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/ +<<<<<<< HEAD +======= +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true +fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /$LIBTORCH_HOUSE_DIR mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR @@ -431,8 +467,21 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then pushd $PYTORCH_ROOT/test # Install the wheel for this Python version +<<<<<<< HEAD + pip uninstall -y "$TORCH_PACKAGE_NAME" + +======= + if [[ 
"$USE_SPLIT_BUILD" == "true" ]]; then + pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true + fi + pip uninstall -y "$TORCH_PACKAGE_NAME" + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v # Print info on the libraries installed in this wheel diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 2a822295e0361..e7263ae1da951 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -66,9 +66,12 @@ case ${CUDA_VERSION} in TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" fi ;; +<<<<<<< HEAD 13.0) TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" ;; @@ -113,6 +116,7 @@ DEPS_SONAME=( ) +<<<<<<< HEAD # CUDA_VERSION 12.*, 13.* if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_STATIC_CUDNN=0 @@ -125,6 +129,15 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." +======= +# CUDA_VERSION 12.6, 12.8, 12.9 +if [[ $CUDA_VERSION == 12* ]]; then + export USE_STATIC_CUDNN=0 + # Try parallelizing nvcc as well + export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling with cudnn and cublas." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -134,12 +147,23 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" +<<<<<<< HEAD "/usr/local/cuda/lib64/libcusparseLt.so.0" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" "/usr/local/cuda/lib64/libnvshmem_host.so.3" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" +======= + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcusparseLt.so.0" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/lib64/libnvrtc-builtins.so" + "/usr/local/cuda/lib64/libcufile.so.0" + "/usr/local/cuda/lib64/libcufile_rdma.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) DEPS_SONAME+=( "libcudnn_adv.so.9" @@ -150,6 +174,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" +<<<<<<< HEAD "libcusparseLt.so.0" "libnvrtc-builtins.so" "libnvshmem_host.so.3" @@ -230,6 +255,35 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then ) fi +======= + "libcublas.so.12" + "libcublasLt.so.12" + "libcusparseLt.so.0" + "libcudart.so.12" + "libnvrtc.so.12" + "libnvrtc-builtins.so" + "libcufile.so.0" + "libcufile_rdma.so.1" + ) + else + echo "Using nvidia libs 
from pypi." + CUDA_RPATHS=( + '$ORIGIN/../../nvidia/cublas/lib' + '$ORIGIN/../../nvidia/cuda_cupti/lib' + '$ORIGIN/../../nvidia/cuda_nvrtc/lib' + '$ORIGIN/../../nvidia/cuda_runtime/lib' + '$ORIGIN/../../nvidia/cudnn/lib' + '$ORIGIN/../../nvidia/cufft/lib' + '$ORIGIN/../../nvidia/curand/lib' + '$ORIGIN/../../nvidia/cusolver/lib' + '$ORIGIN/../../nvidia/cusparse/lib' + '$ORIGIN/../../nvidia/cusparselt/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index d78fbd5c3ed36..dc9bf200bbcb8 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -92,7 +92,11 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD retry pip install -qUr requirements-build.txt +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt retry pip install -q numpy==2.0.1 @@ -104,7 +108,11 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr fi +<<<<<<< HEAD echo "Calling -m pip install . -v --no-build-isolation at $(date)" +======= +echo "Calling setup.py install at $(date)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $LIBTORCH_VARIANT = *"static"* ]]; then STATIC_CMAKE_FLAG="-DTORCH_STATIC=1" @@ -120,7 +128,11 @@ fi # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed CFLAGS='-Wno-deprecated-declarations' \ BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \ +<<<<<<< HEAD python -m pip install --no-build-isolation -v . 
+======= + python setup.py install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p libtorch/{lib,bin,include,share} diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index bac56746f4501..1f97907e35040 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -107,10 +107,13 @@ if [[ $ROCM_INT -ge 60200 ]]; then ROCM_SO_FILES+=("librocm-core.so") fi +<<<<<<< HEAD if [[ $ROCM_INT -ge 70000 ]]; then ROCM_SO_FILES+=("librocroller.so") fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" @@ -198,7 +201,11 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) +<<<<<<< HEAD ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES) +======= +ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipblaslt library files HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index bd7b168be336c..034ef7cf08fc9 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -25,7 +25,10 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh export USE_STATIC_MKL=1 export USE_ONEMKL=1 export USE_XCCL=1 +<<<<<<< HEAD export USE_MPI=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/pytorch/build-mobile.sh b/.ci/pytorch/build-mobile.sh new file mode 100755 index 0000000000000..1f253ff58c03d --- /dev/null +++ b/.ci/pytorch/build-mobile.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables +set -eu -o pipefail + +# This script uses linux host toolchain + mobile build options in order to +# build & test mobile libtorch without having to setup Android/iOS +# toolchain/simulator. + +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# shellcheck source=./common-build.sh +source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" + +# Install torch & torchvision - used to download & trace test model. +# Ideally we should use the libtorch built on the PR so that backward +# incompatible changes won't break this script - but it will significantly slow +# down mobile CI jobs. +# Here we install nightly instead of stable so that we have an option to +# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly. +retry pip install --pre torch torchvision \ + -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \ + --progress-bar off + +# Run end-to-end process of building mobile library, linking into the predictor +# binary, and running forward pass with a real model. 
+if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then + TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh +elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then + test/mobile/lightweight_dispatch/build.sh +else + TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh +fi + +print_sccache_stats diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index cae81a2568d5c..d1db011006ebe 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -11,6 +11,13 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +<<<<<<< HEAD +======= +if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then + exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@" +fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Python version:" python --version @@ -50,6 +57,12 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then export ATEN_THREADING=NATIVE fi +<<<<<<< HEAD +======= +# Enable LLVM dependency for TensorExpr testing +export USE_LLVM=/opt/llvm +export LLVM_DIR=/opt/llvm/lib/cmake/llvm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with @@ -89,6 +102,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export USE_MKLDNN=1 export USE_MKLDNN_ACL=1 +<<<<<<< HEAD export ACL_ROOT_DIR=/acl fi @@ -111,6 +125,9 @@ if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then export SLEEF_TARGET_EXEC_USE_QEMU=ON sudo chown -R jenkins /var/lib/jenkins/workspace /opt +======= + export ACL_ROOT_DIR=/ComputeLibrary +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then @@ -138,8 +155,31 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then fi # Use special scripts for Android builds +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then +======= +if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then + export ANDROID_NDK=/opt/ndk + build_args=() + if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then + build_args+=("-DANDROID_ABI=armeabi-v7a") + elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then + build_args+=("-DANDROID_ABI=arm64-v8a") + elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then + build_args+=("-DANDROID_ABI=x86") + elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then + build_args+=("-DANDROID_ABI=x86_64") + fi + if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then + build_args+=("-DUSE_VULKAN=ON") + fi + build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF") + exec ./scripts/build_android.sh "${build_args[@]}" "$@" +fi + +if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export USE_VULKAN=1 # shellcheck disable=SC1091 source /var/lib/jenkins/vulkansdk/setup-env.sh @@ -173,7 +213,10 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/mpi/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 +<<<<<<< HEAD export USE_MPI=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XPU kineto feature dependencies are not fully ready, disable 
kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -195,6 +238,7 @@ fi # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # memory to build and will OOM +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then J=2 # default to 2 jobs @@ -205,6 +249,12 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' esac echo "Building FlashAttention with job limit $J" export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}" +======= +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then + echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM" + echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage" + export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then @@ -219,6 +269,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then export USE_ASAN=1 export REL_WITH_DEB_INFO=1 export UBSAN_FLAGS="-fno-sanitize-recover=all" +<<<<<<< HEAD +======= + unset USE_LLVM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then @@ -229,6 +283,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then export USE_PRECOMPILED_HEADERS=1 fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi @@ -236,12 +291,23 @@ fi if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then export CMAKE_BUILD_TYPE=Debug elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +======= +if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then + export BUILD_STATIC_RUNTIME_BENCHMARK=ON +fi + +if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export CMAKE_BUILD_TYPE=RelWithAssert fi # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then +======= +if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") cleanup_workspace() { @@ -286,18 +352,32 @@ else # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
+<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then +======= + if [[ "$BUILD_ENVIRONMENT" != *rocm* && + "$BUILD_ENVIRONMENT" != *xla* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 WERROR=1 python setup.py clean +<<<<<<< HEAD WERROR=1 python -m build --wheel --no-isolation +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + python3 tools/packaging/split_wheel.py bdist_wheel + else + WERROR=1 python setup.py bdist_wheel + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi +<<<<<<< HEAD python -m build --wheel --no-isolation fi pip_install_whl "$(echo dist/*.whl)" @@ -322,6 +402,16 @@ else if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchao* ]]; then install_torchao fi +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "USE_SPLIT_BUILD cannot be used with xla or rocm" + exit 1 + else + python setup.py bdist_wheel + fi + fi + pip_install_whl "$(echo dist/*.whl)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then echo "Checking that xpu is compiled" @@ -410,8 +500,15 @@ else # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has # 16 CPUs +<<<<<<< HEAD MAX_JOBS=$(nproc --ignore=4) export MAX_JOBS +======= + if [ -z "$MAX_JOBS_OVERRIDE" ]; then + MAX_JOBS=$(nproc --ignore=4) + export MAX_JOBS + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Install outside of source directory (at the same level as the root # pytorch folder) so that it doesn't get cleaned away prior to docker push. 
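The MAX_JOBS hunk above caps build parallelism to mitigate libtorch OOMs: both sides compute the limit with nproc --ignore=4 (the CPU count minus four, so 12 jobs on the 16-CPU 4xlarge mentioned in the comment), and the incoming side additionally skips the cap when MAX_JOBS_OVERRIDE is already set. A minimal standalone bash sketch of that arithmetic, reusing only names that appear in the hunk, is:

#!/usr/bin/env bash
# Illustrative sketch of the job-limit logic in the hunk above, not a drop-in replacement.
set -euo pipefail

if [ -z "${MAX_JOBS_OVERRIDE:-}" ]; then
  # nproc --ignore=4 reports the online CPU count minus 4 (never below 1),
  # e.g. 12 jobs on the 16-CPU 4xlarge runner referenced in the comment.
  MAX_JOBS=$(nproc --ignore=4)
  export MAX_JOBS
fi
# When MAX_JOBS_OVERRIDE is set, the hunk leaves MAX_JOBS untouched and the
# caller's value is assumed to take effect through the existing build environment.

echo "MAX_JOBS=${MAX_JOBS:-unset (build default: CPU count minus 2)}"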
@@ -428,7 +525,12 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build python tools/stats/export_test_times.py fi +<<<<<<< HEAD # don't do this for bazel or s390x or riscv64 as they don't use sccache if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +======= +# don't do this for bazel or s390x as they don't use sccache +if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print_sccache_stats fi diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 0f632f8006c07..83af7493bc821 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -300,3 +300,27 @@ except RuntimeError as e: exit 1 fi fi +<<<<<<< HEAD +======= + +############################################################################### +# Check for C++ ABI compatibility to GCC-11 - GCC 13 +############################################################################### +if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then + pushd /tmp + # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html + # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 + # gcc 11 - CUDA 11.8, xpu, rocm + # gcc 13 - CUDA 12.6, 12.8 and cpu + # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 + if [[ "$(uname -m)" == "s390x" ]]; then + cxx_abi="19" + elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then + cxx_abi="18" + else + cxx_abi="16" + fi + python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" + popd +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/common-build.sh b/.ci/pytorch/common-build.sh index 8ca9fdb34c77a..23dca9287491e 100644 --- a/.ci/pytorch/common-build.sh +++ b/.ci/pytorch/common-build.sh @@ -13,6 +13,7 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then fi if which sccache > /dev/null; then +<<<<<<< HEAD # Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise # sccache will complain about invalid bucket configuration if [[ -z "${SCCACHE_BUCKET:-}" ]]; then @@ -20,6 +21,8 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then unset SCCACHE_REGION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Save sccache logs to file sccache --stop-server > /dev/null 2>&1 || true rm -f ~/sccache_error.log || true diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 2325c3d4ed4e7..a9dc698e939b0 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -67,17 +67,26 @@ function pip_install_whl() { # Loop through each path and install individually for path in "${paths[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done else # Loop through each argument and install individually for path in "${args[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" 
+======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done fi } +<<<<<<< HEAD function pip_build_and_install() { local build_target=$1 local wheel_dir=$2 @@ -105,6 +114,8 @@ function pip_build_and_install() { pip_install_whl "${file}" done } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function pip_install() { # retry 3 times @@ -148,6 +159,7 @@ function get_pinned_commit() { cat .github/ci_commit_pins/"${1}".txt } +<<<<<<< HEAD function detect_cuda_arch() { if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then if command -v nvidia-smi; then @@ -165,6 +177,19 @@ function install_torchaudio() { local commit commit=$(get_pinned_commit audio) pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio +======= +function install_torchaudio() { + local commit + commit=$(get_pinned_commit audio) + if [[ "$1" == "cuda" ]]; then + # TODO: This is better to be passed as a parameter from _linux-test workflow + # so that it can be consistent with what is set in build + TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" + else + pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_torchtext() { @@ -172,8 +197,13 @@ function install_torchtext() { local text_commit data_commit=$(get_pinned_commit data) text_commit=$(get_pinned_commit text) +<<<<<<< HEAD pip_build_and_install "git+https://github.com/pytorch/data.git@${data_commit}" dist/data pip_build_and_install "git+https://github.com/pytorch/text.git@${text_commit}" dist/text +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function install_torchvision() { @@ -186,6 +216,7 @@ function install_torchvision() { echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then # Not sure if both are needed, but why not @@ -194,6 +225,9 @@ function install_torchvision() { fi pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -n "${LD_PRELOAD}" ]; then LD_PRELOAD=${orig_preload} fi @@ -213,6 +247,7 @@ function install_torchrec_and_fbgemm() { if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm +<<<<<<< HEAD pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_uninstall fbgemm-gpu-nightly @@ -286,12 +321,37 @@ EOF else pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_build_and_install 
"git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu +======= + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + pip_uninstall fbgemm-gpu-nightly + + pip_install tabulate # needed for newer fbgemm + pip_install patchelf # needed for rocm fbgemm + git clone --recursive https://github.com/pytorch/fbgemm + pushd fbgemm/fbgemm_gpu + git checkout "${fbgemm_commit}" + python setup.py install \ + --package_variant=rocm \ + -DHIP_ROOT_DIR="${ROCM_PATH}" \ + -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ + -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + popd + rm -rf fbgemm + else + # See https://github.com/pytorch/pytorch/issues/106971 + CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi } function clone_pytorch_xla() { if [[ ! -d ./xla ]]; then +<<<<<<< HEAD git clone --recursive --quiet https://github.com/pytorch/xla.git +======= + git clone --recursive -b r2.8 https://github.com/pytorch/xla.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" @@ -301,10 +361,41 @@ function clone_pytorch_xla() { fi } +<<<<<<< HEAD function install_torchao() { local commit commit=$(get_pinned_commit torchao) pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao +======= +function checkout_install_torchbench() { + local commit + commit=$(get_pinned_commit torchbench) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + if [ "$1" ]; then + python install.py --continue_on_fail models "$@" + else + # Occasionally the installation may fail on one model but it is ok to continue + # to install and test other models + python install.py --continue_on_fail + fi + + # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488 + # is regressing speedup metric. 
This needs to be investigated further + pip install transformers==4.38.1 + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze + popd +} + +function install_torchao() { + local commit + commit=$(get_pinned_commit torchao) + pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function print_sccache_stats() { diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index f085fa78bebe9..536966a992503 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,11 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp +<<<<<<< HEAD time make VERBOSE=1 html +======= +time make VERBOSE=1 html -j +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd popd diff --git a/.ci/pytorch/create_test_cert.py b/.ci/pytorch/create_test_cert.py new file mode 100644 index 0000000000000..f2be0c13227d1 --- /dev/null +++ b/.ci/pytorch/create_test_cert.py @@ -0,0 +1,123 @@ +from datetime import datetime, timedelta, timezone +from tempfile import mkdtemp + +from cryptography import x509 +from cryptography.hazmat.primitives import hashes, serialization +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.x509.oid import NameOID + + +temp_dir = mkdtemp() +print(temp_dir) + + +def genrsa(path): + key = rsa.generate_private_key( + public_exponent=65537, + key_size=2048, + ) + with open(path, "wb") as f: + f.write( + key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.NoEncryption(), + ) + ) + return key + + +def create_cert(path, C, ST, L, O, key): + subject = issuer = x509.Name( + [ + x509.NameAttribute(NameOID.COUNTRY_NAME, C), + x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), + x509.NameAttribute(NameOID.LOCALITY_NAME, L), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), + ] + ) + cert = ( + x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(issuer) + .public_key(key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(datetime.now(timezone.utc)) + .not_valid_after( + # Our certificate will be valid for 10 days + datetime.now(timezone.utc) + timedelta(days=10) + ) + .add_extension( + x509.BasicConstraints(ca=True, path_length=None), + critical=True, + ) + .sign(key, hashes.SHA256()) + ) + # Write our certificate out to disk. + with open(path, "wb") as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return cert + + +def create_req(path, C, ST, L, O, key): + csr = ( + x509.CertificateSigningRequestBuilder() + .subject_name( + x509.Name( + [ + # Provide various details about who we are. 
+ x509.NameAttribute(NameOID.COUNTRY_NAME, C), + x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), + x509.NameAttribute(NameOID.LOCALITY_NAME, L), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), + ] + ) + ) + .sign(key, hashes.SHA256()) + ) + with open(path, "wb") as f: + f.write(csr.public_bytes(serialization.Encoding.PEM)) + return csr + + +def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key): + cert = ( + x509.CertificateBuilder() + .subject_name(csr_cert.subject) + .issuer_name(ca_cert.subject) + .public_key(csr_cert.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(datetime.now(timezone.utc)) + .not_valid_after( + # Our certificate will be valid for 10 days + datetime.now(timezone.utc) + timedelta(days=10) + # Sign our certificate with our private key + ) + .sign(private_ca_key, hashes.SHA256()) + ) + with open(path, "wb") as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return cert + + +ca_key = genrsa(temp_dir + "/ca.key") +ca_cert = create_cert( + temp_dir + "/ca.pem", + "US", + "New York", + "New York", + "Gloo Certificate Authority", + ca_key, +) + +pkey = genrsa(temp_dir + "/pkey.key") +csr = create_req( + temp_dir + "/csr.csr", + "US", + "California", + "San Francisco", + "Gloo Testing Company", + pkey, +) + +cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key) diff --git a/.ci/pytorch/functorch_doc_push_script.sh b/.ci/pytorch/functorch_doc_push_script.sh new file mode 100755 index 0000000000000..85c70dffa3966 --- /dev/null +++ b/.ci/pytorch/functorch_doc_push_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# This is where the local pytorch install in the docker image is located +pt_checkout="/var/lib/jenkins/workspace" +source "$pt_checkout/.ci/pytorch/common_utils.sh" +echo "functorch_doc_push_script.sh: Invoked with $*" + +set -ex -o pipefail + +version=${DOCS_VERSION:-nightly} +echo "version: $version" + +# Build functorch docs +pushd $pt_checkout/functorch/docs +make html +popd + +git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages +pushd functorch_ghpages + +if [ "$version" == "main" ]; then + version=nightly +fi + +git rm -rf "$version" || true +mv "$pt_checkout/functorch/docs/build/html" "$version" + +git add "$version" || true +git status +git config user.email "soumith+bot@pytorch.org" +git config user.name "pytorchbot" +# If there aren't changes, don't make a commit; push is no-op +git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true +git status + +if [[ "${WITH_PUSH:-}" == true ]]; then + git push -u origin gh-pages +fi + +popd diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index c01efda11ea6f..4c15dd5b1f9f9 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -36,11 +36,19 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls +<<<<<<< HEAD USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation else # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests # that building with USE_DISTRIBUTED=0 works at all. 
See https://github.com/pytorch/pytorch/issues/86448
 USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64
+=======
+ USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+else
+ # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+ # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+ USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 if which sccache > /dev/null; then
 print_sccache_stats
diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh
index 2687852a2c4f3..7ec63ae2dd185 100755
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@@ -55,7 +55,11 @@ test_python_shard() {
 setup_test_python
+<<<<<<< HEAD
 time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+=======
+ time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 assert_git_not_dirty
 }
@@ -157,6 +161,7 @@ test_jit_hooks() {
 assert_git_not_dirty
 }
+<<<<<<< HEAD
 # Shellcheck doesn't like it when you pass no arguments to a function
 # that can take args. See https://www.shellcheck.net/wiki/SC2120
 # shellcheck disable=SC2120
@@ -185,6 +190,8 @@ checkout_install_torchbench() {
 python -mpip freeze
 }
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 torchbench_setup_macos() {
 git clone --recursive https://github.com/pytorch/vision torchvision
 git clone --recursive https://github.com/pytorch/audio torchaudio
@@ -195,7 +202,11 @@ torchbench_setup_macos() {
 git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"
 git submodule update --init --recursive
 python setup.py clean
+<<<<<<< HEAD
 python -m pip install -e . -v --no-build-isolation
+=======
+ python setup.py develop
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 popd
 pushd torchaudio
@@ -204,14 +215,26 @@ torchbench_setup_macos() {
 git submodule update --init --recursive
 python setup.py clean
 #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
+<<<<<<< HEAD
 USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation
 popd
+=======
+ USE_OPENMP=0 python setup.py develop
+ popd
+
+ # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
+ # shellcheck disable=SC2119,SC2120
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 checkout_install_torchbench
 }
 pip_benchmark_deps() {
+<<<<<<< HEAD
 python -mpip install --no-input requests cython scikit-learn six
+=======
+ python -mpip install --no-input astunparse requests cython scikit-learn
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 }
@@ -256,7 +279,11 @@ test_torchbench_smoketest() {
 local device=mps
 local dtypes=(undefined float16 bfloat16 notset)
 local dtype=${dtypes[$1]}
+<<<<<<< HEAD
 local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
+=======
+ local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 for backend in eager inductor; do
@@ -302,6 +329,7 @@ test_torchbench_smoketest() {
 fi
 done
+<<<<<<< HEAD
 echo "Pytorch benchmark on mps device completed"
 }
@@ -343,6 +371,8 @@ test_aoti_torchbench_smoketest() {
 PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
 --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
 --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 echo "Pytorch benchmark on mps device completed"
 }
@@ -391,8 +421,11 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
 test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
 test_torchbench_smoketest "${SHARD_NUMBER}"
+<<<<<<< HEAD
 elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
 test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 elif [[ $TEST_CONFIG == *"mps"* ]]; then
 test_python_mps
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh
index 039459816724f..4ab4c038eadf9 100755
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@@ -26,7 +26,10 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
 time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
+<<<<<<< HEAD
 time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 time python test/run_test.py --verbose -i distributed/test_store
 time python test/run_test.py --verbose -i distributed/test_symmetric_memory
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
@@ -46,7 +49,10 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
 # DTensor tests
 time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
 time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
+<<<<<<< HEAD
 time python test/run_test.py --verbose -i distributed/tensor/test_utils.py
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # DeviceMesh test
 time python test/run_test.py --verbose -i distributed/test_device_mesh
diff --git a/.ci/pytorch/run_glootls_test.sh b/.ci/pytorch/run_glootls_test.sh
new file mode 100755
index 0000000000000..cd17b269fe6a9
--- /dev/null
+++ b/.ci/pytorch/run_glootls_test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
+TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
+
+openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
+
+export GLOO_DEVICE_TRANSPORT=TCP_TLS
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
+export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
+
+time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
+
+unset GLOO_DEVICE_TRANSPORT
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
+unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE
diff --git a/.ci/pytorch/run_tests.sh b/.ci/pytorch/run_tests.sh
index f5ed90deef249..97ae8d22c7917 100755
--- a/.ci/pytorch/run_tests.sh
+++ b/.ci/pytorch/run_tests.sh
@@ -74,6 +74,7 @@ else
 fi
 # Environment initialization
+<<<<<<< HEAD
 retry pip install -qUr requirements-build.txt
 if [[ "$(uname)" == Darwin ]]; then
 # Install the testing dependencies
@@ -81,6 +82,14 @@ if [[ "$(uname)" == Darwin ]]; then
 else
 retry pip install -qr requirements.txt || true
 retry pip install -q hypothesis protobuf pytest || true
+=======
+if [[ "$(uname)" == Darwin ]]; then
+ # Install the testing dependencies
+ retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
+else
+ retry pip install -qr requirements.txt || true
+ retry pip install -q hypothesis protobuf pytest setuptools || true
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 numpy_ver=1.15
 case "$(python --version 2>&1)" in
 *2* | *3.5* | *3.6*)
diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py
index b0c607659c72d..1dd56236e2619 100755
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@@ -32,9 +32,12 @@
 "torch::",
 )
+<<<<<<< HEAD
 # Patterns for detecting statically linked libstdc++ symbols
 STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def _apply_libtorch_symbols(symbols):
 return [
@@ -56,17 +59,24 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
 return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]
+<<<<<<< HEAD
 def grep_symbols(
 lib: str, patterns: list[Any], symbol_type: str | None = None
 ) -> list[str]:
+=======
+def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def _grep_symbols(
 symbols: list[tuple[str, str, str]], patterns: list[Any]
 ) -> list[str]:
 rc = []
 for
_s_addr, _s_type, s_name in symbols: +<<<<<<< HEAD # Filter by symbol type if specified if symbol_type and _s_type != symbol_type: continue +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -88,6 +98,7 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +<<<<<<< HEAD def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: cxx11_statically_linked_symbols = grep_symbols( lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" @@ -100,6 +111,8 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -127,7 +140,10 @@ def main() -> None: libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) +<<<<<<< HEAD check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 675d58a3e283d..7840686f13171 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -385,6 +385,7 @@ def foo(x: torch.Tensor) -> torch.Tensor: x_pt2 = torch.compile(model, mode="max-autotune")(x) +<<<<<<< HEAD def smoke_test_nvshmem() -> None: if not torch.cuda.is_available() or target_os == "windows": print("Windows platform or CUDA is not available, skipping NVSHMEM test") @@ -410,6 +411,8 @@ def smoke_test_nvshmem() -> None: print(f"NVSHMEM available at run time: {_is_nvshmem_available()}") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def smoke_test_modules(): cwd = os.getcwd() for module in MODULES: @@ -504,8 +507,11 @@ def main() -> None: options.pypi_pkg_check, ) +<<<<<<< HEAD smoke_test_nvshmem() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": main() diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 40dc90f2eb24f..90546839a2bd8 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -11,8 +11,11 @@ export TERM=vt100 # shellcheck source=./common.sh source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +<<<<<<< HEAD # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs @@ -32,6 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi +<<<<<<< HEAD # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then @@ -44,6 +48,8 @@ if [[ "$BUILD_ENVIRONMENT" 
== *cuda* ]]; then fi fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Environment variables:" env @@ -103,7 +109,10 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then export VALGRIND=OFF fi +<<<<<<< HEAD detect_cuda_arch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then # There are additional warnings on s390x, maybe due to newer gcc. @@ -178,6 +187,11 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" # setting PYTHON_TEST_EXTRA_OPTION export PYTHON_TEST_EXTRA_OPTION="--xpu" +<<<<<<< HEAD +======= + # Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585 + sudo rm -rf /opt/cache +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$TEST_CONFIG" == *crossref* ]]; then @@ -302,12 +316,15 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then export ATEN_CPU_CAPABILITY=avx2 fi +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == "legacy_nvidia_driver" ]]; then # Make sure that CUDA can be initialized (cd test && python -c "import torch; torch.rand(2, 2, device='cuda')") export USE_LEGACY_DRIVER=1 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_python_legacy_jit() { time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose assert_git_not_dirty @@ -324,18 +341,27 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. 
# This set of tests has been shown to be buggy without it for the split-build +<<<<<<< HEAD time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +======= + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 +<<<<<<< HEAD time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION +======= + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_python_smoke() { +<<<<<<< HEAD # Smoke tests for H100/B200 time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty @@ -344,6 +370,10 @@ test_python_smoke() { test_python_smoke_b200() { # Targeted smoke tests for B200 - staged approach to avoid too many failures time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +======= + # Smoke tests for H100 + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -352,6 +382,7 @@ test_h100_distributed() { time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running # This test requires multicast support time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running +<<<<<<< HEAD assert_git_not_dirty } @@ -370,6 +401,14 @@ test_h100_cutlass_backend() { TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running } +======= + # symmetric memory test + time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_lazy_tensor_meta_reference_disabled() { export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1 echo "Testing lazy tensor operations without meta reference" @@ -392,7 +431,10 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ 
--exclude-aot-dispatch-tests \ +<<<<<<< HEAD --exclude-quantization-tests \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -413,10 +455,16 @@ test_einops() { test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" +<<<<<<< HEAD python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_on_gpu_device1 --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_gpu_device --verbose python test/run_test.py -i inductor/test_aot_inductor.py -k test_load_package_multiple_gpus --verbose +======= + python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose python test/run_test.py -i distributed/tensor/test_dtensor_compile.py --verbose python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose @@ -437,7 +485,11 @@ test_inductor_distributed() { # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported # with if required # gpus aren't available +<<<<<<< HEAD python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose +======= + python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -460,12 +512,19 @@ test_inductor_shard() { --verbose } +<<<<<<< HEAD test_inductor_aoti_cpp() { +======= +test_inductor_aoti() { + # docker build uses bdist_wheel which does not work with test_aot_inductor + # TODO: need a faster way to build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # We need to hipify before building again python3 tools/amd_build/build_amd.py fi if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then +<<<<<<< HEAD # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}") else @@ -489,6 +548,16 @@ test_inductor_aoti_cross_compile_for_windows() { ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib" +======= + BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop + # TODO: Replace me 
completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB + LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH + CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + else + BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } test_inductor_cpp_wrapper_shard() { @@ -501,6 +570,7 @@ test_inductor_cpp_wrapper_shard() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD # Run certain inductor unit tests with cpp wrapper. In the end state, we # should be able to run all the inductor unit tests with cpp_wrapper. # @@ -528,6 +598,48 @@ test_inductor_cpp_wrapper_shard() { -k 'xpu' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose +======= + if [[ "$1" -eq "2" ]]; then + # For now, manually put the opinfo tests in shard 2, and all other tests in + # shard 1. Run all CPU tests, as well as specific GPU tests triggering past + # bugs, for now. + python test/run_test.py \ + --include inductor/test_torchinductor_opinfo \ + -k 'linalg or to_sparse or TestInductorOpInfoCPU' \ + --verbose + exit + fi + + # Run certain inductor unit tests with cpp wrapper. In the end state, we + # should be able to run all the inductor unit tests with cpp_wrapper. + python test/run_test.py \ + --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \ + --verbose + python test/run_test.py --inductor --include test_torch -k 'take' --verbose + + # Run inductor benchmark tests with cpp wrapper. + # Skip benchmark tests if it's in rerun-disabled-mode. 
+ if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then + echo "skip dynamo benchmark tests for rerun-disabled-test" + else + echo "run dynamo benchmark tests with cpp wrapper" + python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \ + --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \ + --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv" + + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/torchbench.py --device cuda --accuracy \ + --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi } @@ -649,8 +761,13 @@ test_perf_for_dashboard() { local device=cuda if [[ "${TEST_CONFIG}" == *cpu* ]]; then +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == *cpu_x86_zen* ]]; then device=cpu_x86_zen +======= + if [[ "${TEST_CONFIG}" == *zen_cpu_x86* ]]; then + device=zen_cpu_x86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then device=cpu_x86 elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then @@ -661,19 +778,26 @@ test_perf_for_dashboard() { device=cuda_a10g elif [[ "${TEST_CONFIG}" == *h100* ]]; then device=cuda_h100 +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *b200* ]]; then device=cuda_b200 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *rocm* ]]; then device=rocm fi for mode in "${modes[@]}"; do if [[ "$mode" == "inference" ]]; then +<<<<<<< HEAD if [[ "$device" == "cpu_x86" ]]; then dtype=amp else dtype=bfloat16 fi +======= + dtype=bfloat16 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "$mode" == "training" ]]; then dtype=amp fi @@ -685,10 +809,13 @@ test_perf_for_dashboard() { target_flag+=( --no-translation-validation) fi +<<<<<<< HEAD if [[ "$DASHBOARD_TAG" == *freezing-true* ]]; then target_flag+=( --freezing) fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DASHBOARD_TAG" == *default-true* ]]; then $TASKSET python "benchmarks/dynamo/$suite.py" \ "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \ @@ -837,6 +964,7 @@ test_dynamo_benchmark() { if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@" elif [[ "${TEST_CONFIG}" == *perf* ]]; then +<<<<<<< HEAD # TODO (huydhn): Just smoke test 
some sample models if [[ "${TEST_CONFIG}" == *b200* ]]; then if [[ "${suite}" == "huggingface" ]]; then @@ -847,6 +975,8 @@ test_dynamo_benchmark() { export TORCHBENCH_ONLY_MODELS="BERT_pytorch" fi fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@" else if [[ "${TEST_CONFIG}" == *cpu* ]]; then @@ -875,13 +1005,21 @@ test_inductor_torchbench_smoketest_perf() { mkdir -p "$TEST_REPORTS_DIR" python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \ +<<<<<<< HEAD --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \ +======= + --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 # Check memory compression ratio for a few models +<<<<<<< HEAD for test in BERT_pytorch yolov3; do +======= + for test in hf_Albert timm_vision_transformer; do +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \ --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \ --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" @@ -892,7 +1030,11 @@ test_inductor_torchbench_smoketest_perf() { done # Perform some "warm-start" runs for a few huggingface models. 
+<<<<<<< HEAD for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do +======= + for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" python benchmarks/dynamo/check_accuracy.py \ @@ -906,7 +1048,11 @@ test_inductor_set_cpu_affinity(){ export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" +<<<<<<< HEAD if [[ "$(uname -m)" != "aarch64" ]]; then +======= + if [[ "${TEST_CONFIG}" != *aarch64* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use Intel OpenMP for x86 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD" @@ -920,7 +1066,11 @@ test_inductor_set_cpu_affinity(){ cores=$((cpus / thread_per_core)) # Set number of cores to 16 on aarch64 for performance runs +<<<<<<< HEAD if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then +======= + if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cores=16 fi export OMP_NUM_THREADS=$cores @@ -974,6 +1124,15 @@ test_torchbench_gcp_smoketest(){ popd } +<<<<<<< HEAD +======= +test_python_gloo_with_tls() { + source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_aten() { # Test ATen # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: @@ -1020,8 +1179,11 @@ test_without_numpy() { if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')" fi +<<<<<<< HEAD # Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy) python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd } @@ -1085,10 +1247,26 @@ test_libtorch_api() { mkdir -p $TEST_REPORTS_DIR OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml +<<<<<<< HEAD +======= + "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest" +<<<<<<< HEAD +======= + # On s390x, pytorch is built without llvm. 
+ # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and + # test fails with errors like: + # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer + # unknown file: Failure + # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) } + if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then + python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # quantization is not fully supported on s390x yet @@ -1171,12 +1349,15 @@ test_distributed() { fi } +<<<<<<< HEAD test_quantization() { echo "Testing quantization" python test/test_quantization.py } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1362,6 +1543,7 @@ EOF # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing # file is modified to introduce an invalid public API function. +<<<<<<< HEAD # The filepath here must not have __all__ defined in it, otherwise the test will pass. # If your PR introduces __all__ to torch/cuda/streams.py please point this to another file # that does not have __all__ defined. @@ -1369,6 +1551,12 @@ EOF cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" invalid_api="torch.cuda.streams.new_public_func" +======= + EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py" + cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" + echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" + invalid_api="torch.nn.parameter.new_public_func" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..." 
check_public_api_test_fails \ @@ -1423,7 +1611,11 @@ EOF pip3 install -r requirements.txt # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" +<<<<<<< HEAD python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist" +======= + python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python -mpip install base_dist/*.whl echo "::endgroup::" @@ -1571,10 +1763,21 @@ test_executorch() { install_torchvision install_torchaudio +<<<<<<< HEAD INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" pushd /executorch "${INSTALL_SCRIPT}" setup_executorch +======= + pushd /executorch + + export PYTHON_EXECUTABLE=python + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + + # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch + # from the PR + bash .ci/scripts/setup-linux.sh --build-tool cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1588,14 +1791,25 @@ test_executorch() { popd +<<<<<<< HEAD +======= + # Test torchgen generated code for Executorch. + echo "Testing ExecuTorch op registration" + "$BUILD_BIN_DIR"/test_edge_op_registration + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ +<<<<<<< HEAD test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ +======= + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1621,12 +1835,19 @@ test_operator_benchmark() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" TEST_DIR=$(pwd) +<<<<<<< HEAD ARCH=$(uname -m) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_inductor_set_cpu_affinity cd benchmarks/operator_benchmark/pt_extension +<<<<<<< HEAD python -m pip install . 
-v --no-build-isolation +======= + python setup.py install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd "${TEST_DIR}"/benchmarks/operator_benchmark $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ @@ -1636,6 +1857,7 @@ test_operator_benchmark() { pip_install pandas python check_perf_csv.py \ --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ +<<<<<<< HEAD --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv" } @@ -1658,6 +1880,11 @@ test_operator_microbenchmark() { --benchmark-name "PyTorch operator microbenchmark" done } +======= + --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1665,6 +1892,7 @@ if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze fi if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then # Install numpy-2.0.2 and compatible scipy & numba versions +<<<<<<< HEAD # Force re-install of pandas to avoid error where pandas checks numpy version from initial install and fails upon import TMP_PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)" 2>/dev/null) if [ -n "$TMP_PANDAS_VERSION" ]; then @@ -1674,6 +1902,11 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then fi python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then +======= + python -mpip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 + python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py +elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_linux_aarch64 elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility @@ -1682,16 +1915,22 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla test_xla +<<<<<<< HEAD elif [[ "$TEST_CONFIG" == *vllm* ]]; then echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" (cd .ci/lumen_cli && python -m pip install -e .) 
python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +<<<<<<< HEAD elif [[ "$TEST_CONFIG" == 'quantization' ]]; then test_quantization +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1714,8 +1953,11 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then test_operator_microbenchmark +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1724,8 +1966,11 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then test_inductor_micro_benchmark +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then test_inductor_aoti_cross_compile_for_windows +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1735,6 +1980,7 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then id=$((SHARD_NUMBER-1)) test_dynamo_benchmark timm_models "$id" elif [[ "${TEST_CONFIG}" == cachebench ]]; then +<<<<<<< HEAD install_torchaudio install_torchvision PYTHONPATH=/torchbench test_cachebench @@ -1745,21 +1991,56 @@ elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio install_torchvision +======= + install_torchaudio cuda + install_torchvision + checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco + PYTHONPATH=$(pwd)/torchbench test_cachebench +elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then + install_torchaudio cpu + install_torchvision + checkout_install_torchbench nanogpt + PYTHONPATH=$(pwd)/torchbench test_verify_cachebench +elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then + if [[ "${TEST_CONFIG}" == *cpu* ]]; then + install_torchaudio cpu + else + install_torchaudio cuda + fi + install_torchvision + TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then +<<<<<<< HEAD PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest else +======= + checkout_install_torchbench hf_Bert hf_Albert 
timm_vision_transformer + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then + checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \ + llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ + functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0 + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then + checkout_install_torchbench + TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest + else + checkout_install_torchbench +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Do this after checkout_install_torchbench to ensure we clobber any # nightlies that torchbench may pull in if [[ "${TEST_CONFIG}" != *cpu* ]]; then install_torchrec_and_fbgemm fi +<<<<<<< HEAD PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id" fi elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then @@ -1771,6 +2052,24 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" +======= + PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" + fi +elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then + install_torchaudio cuda + install_torchvision + checkout_install_torchbench hf_T5 llama moco + PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" + test_inductor_aoti +elif [[ "${TEST_CONFIG}" == *inductor* ]]; then + install_torchvision + test_inductor_shard "${SHARD_NUMBER}" + if [[ "${SHARD_NUMBER}" == 1 ]]; then + if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then + test_inductor_distributed + fi + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then @@ -1820,6 +2119,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +<<<<<<< HEAD elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then @@ -1830,6 +2130,10 @@ elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then test_h100_symm_mem elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then test_h100_cutlass_backend +======= +elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then + test_h100_distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else install_torchvision install_monkeytype diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt index e87f37ae61fb4..688395d1615d9 100644 --- a/.ci/pytorch/test_example_code/CMakeLists.txt +++ b/.ci/pytorch/test_example_code/CMakeLists.txt @@ -16,7 +16,11 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse find_library(CUDNN_LIBRARY NAMES cudnn) target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} ) if(MSVC) +<<<<<<< HEAD file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll") +======= + file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll") +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) message("dlls to copy " ${TORCH_DLLS}) add_custom_command(TARGET simple-torch-test POST_BUILD diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 240cc8b559322..960942000d714 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -38,13 +38,20 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail :: Update CMake +<<<<<<< HEAD :: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 if errorlevel 1 goto fail if not errorlevel 0 goto fail +<<<<<<< HEAD :: TODO: Move to .ci/docker/requirements-ci.txt call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0 +======= +call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -63,10 +70,16 @@ if "%USE_XPU%"=="1" ( call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat" call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat" if errorlevel 1 exit /b 1 +<<<<<<< HEAD :: Reduce build time SET TORCH_XPU_ARCH_LIST=bmg :: Re-setup python env for build call pip install -r requirements.txt +======= + :: Reduce build time. Only have MTL self-hosted runner now + SET TORCH_XPU_ARCH_LIST=xe-lpg + SET USE_KINETO=0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) @echo on @@ -132,14 +145,22 @@ if "%USE_CUDA%"=="1" ( :: Print all existing environment variable for debugging set +<<<<<<< HEAD python -m build --wheel --no-isolation +======= +python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 goto fail if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( +<<<<<<< HEAD echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. +======= + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index abd2c8722b11d..09c66282f04d2 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,20 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) +<<<<<<< HEAD set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI +<<<<<<< HEAD if not exist %CONDA_ROOT_DIR% ( +======= +if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set INSTALL_FRESH_CONDA=1 ) @@ -17,14 +25,22 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b +<<<<<<< HEAD %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% +======= + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip +<<<<<<< HEAD call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% :: Activate conda so that we can use its commands, i.e. conda, python, pip call conda activate py_tmp call pip install -r .ci/docker/requirements-ci.txt +======= +call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 3173582b06f45..928fc58113ca6 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,11 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway +<<<<<<< HEAD xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +======= +xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd . 
if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_libtorch.bat b/.ci/pytorch/win-test-helpers/test_libtorch.bat index d6ecd72188760..68dd3d4c28ec4 100644 --- a/.ci/pytorch/win-test-helpers/test_libtorch.bat +++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat @@ -15,6 +15,7 @@ if errorlevel 1 exit /b 1 if not errorlevel 0 exit /b 1 cd %TMP_DIR_WIN%\build\torch\test +<<<<<<< HEAD :: Enable delayed variable expansion to make the list setlocal enabledelayedexpansion @@ -44,6 +45,39 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail goto :eof +======= +for /r "." %%a in (*.exe) do ( + call :libtorch_check "%%~na" "%%~fa" + if errorlevel 1 goto fail +) + +goto :eof + +:libtorch_check + +cd %CWD% +set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test + +:: Skip verify_api_visibility as it a compile level test +if "%~1" == "verify_api_visibility" goto :eof + +echo Running "%~2" +if "%~1" == "c10_intrusive_ptr_benchmark" ( + :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp + call "%~2" + goto :eof +) + +python test\run_test.py --cpp --verbose -i "cpp/%~1" +if errorlevel 1 ( + echo %1 failed with exit code %errorlevel% + goto fail +) +if not errorlevel 0 ( + echo %1 failed with exit code %errorlevel% + goto fail +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :eof exit /b 0 diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index 02829ee369757..f0489db1875f9 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,11 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests +<<<<<<< HEAD python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +======= +python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index a01aa0b6431cd..b7de62bfe553f 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -37,8 +37,25 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi +<<<<<<< HEAD # TODO: Move this to .ci/docker/requirements-ci.txt python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2" +======= +# TODO: Move both of them to Windows AMI +python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Install Z3 optional dependency for Windows builds. +python -m pip install z3-solver==4.12.2.0 + +# Install tlparse for test\dynamo\test_structured_trace.py UTs. 
+python -m pip install tlparse==0.3.30 + +# Install parameterized +python -m pip install parameterized==0.8.1 + +# Install pulp for testing ilps under torch\distributed\_tools +python -m pip install pulp==2.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run_tests() { # Run nvidia-smi if available diff --git a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index b5c2ef65b84ad..dfcc8b7fd47af 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -48,7 +48,11 @@ sccache --zero-stats sccache --show-stats :: Call PyTorch build script +<<<<<<< HEAD python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: show sccache stats sccache --show-stats diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index efb8cfec63e7e..2db616810ecb6 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V126%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" ( set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbd349e2efb4b..0234ec324c039 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V128%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" ( set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8" @@ -37,10 
+40,17 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat index b17e6113c63e2..ad19af5363c3c 100644 --- a/.ci/pytorch/windows/cuda129.bat +++ b/.ci/pytorch/windows/cuda129.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V129%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" ( set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index e0281c0d78a44..993f11e1e0142 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,3 +1,4 @@ +<<<<<<< HEAD if %CUDA_VERSION% geq 130 ( set "dll_path=bin\x64" @@ -19,6 +20,19 @@ copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib +======= +copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat @@ -28,3 +42,11 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if 
exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) +<<<<<<< HEAD +======= + +::copy nvJitLink dll is requires for cuda 12+ +if exist "%CUDA_PATH%\bin\nvJitLink_*.dll*" ( + copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 1349d3e661f55..b17eda7de7815 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -26,7 +26,10 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 if %CUDA_VER% EQU 129 goto cuda129 +<<<<<<< HEAD if %CUDA_VER% EQU 130 goto cuda130 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo CUDA %CUDA_VERSION_STR% is not supported exit /b 1 @@ -114,6 +117,7 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +<<<<<<< HEAD :cuda130 set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe @@ -141,17 +145,22 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :cuda_common :: NOTE: We only install CUDA if we don't have it installed already. :: With GHA runners these should be pre-installed as part of our AMI process :: If you cannot find the CUDA version you want to build for here then please :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( +<<<<<<< HEAD if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z" if errorlevel 1 exit /b 1 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" ( curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" if errorlevel 1 exit /b 1 @@ -178,6 +187,7 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations" ) +<<<<<<< HEAD echo Installing NvToolsExt... 7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" @@ -187,6 +197,8 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo Installing cuDNN... 
7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" @@ -217,4 +229,7 @@ echo Setting up environment... set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +<<<<<<< HEAD set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 2c173aed818b4..f9ffb6de2fd29 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,3 +1,4 @@ +<<<<<<< HEAD set WIN_DRIVER_VN=580.88 set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe @@ -7,3 +8,14 @@ start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-inte if errorlevel 1 exit /b 1 del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL +======= +set WIN_DRIVER_VN=528.89 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +if errorlevel 1 exit /b 1 + +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +if errorlevel 1 exit /b 1 + +del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index b2f68af97b3f4..1ab15fc544cff 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -1,12 +1,16 @@ set ADDITIONAL_OPTIONS="" set PYTHON_EXEC="python" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" ( echo Python version is set to 3.13t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" set ADDITIONAL_OPTIONS="Include_freethreaded=1" set PYTHON_EXEC="python3.13t" +<<<<<<< HEAD ) else if "%DESIRED_PYTHON%"=="3.14t" ( echo Python version is set to 3.14 or 3.14t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe" @@ -14,6 +18,10 @@ if "%DESIRED_PYTHON%" == "3.13t" ( set PYTHON_EXEC="python3.14t" ) else ( echo Python version is set to %DESIRED_PYTHON% +======= +) else ( + echo DESIRED_PYTHON not defined, Python version is set to 
%DESIRED_PYTHON% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% ) @@ -25,5 +33,8 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" +<<<<<<< HEAD %PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build if errorlevel 1 exit /b 1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index 34a5140cb1ee0..bd7627ed7bede 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -86,7 +86,11 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch +<<<<<<< HEAD %PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat index f671a9d0e0abb..eb803a058cae0 100644 --- a/.ci/pytorch/windows/internal/smoke_test.bat +++ b/.ci/pytorch/windows/internal/smoke_test.bat @@ -148,7 +148,18 @@ if "%NVIDIA_GPU_EXISTS%" == "0" ( goto end ) +<<<<<<< HEAD cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ +======= +set BUILD_SPLIT_CUDA= +if exist "%install_root%\lib\torch_cuda_cu.lib" if exist "%install_root%\lib\torch_cuda_cpp.lib" set BUILD_SPLIT_CUDA=ON + +if "%BUILD_SPLIT_CUDA%" == "ON" ( + cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda_cu.lib torch_cuda_cpp.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ /INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z +) else ( + cl %PYTORCH_ROOT%\.ci\pytorch\test_example_code\check-torch-cuda.cpp torch_cpu.lib c10.lib torch_cuda.lib /EHsc /std:c++17 /link /INCLUDE:?warp_size@cuda@at@@YAHXZ +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .\check-torch-cuda.exe if ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index cd1fc484ae155..f06b3a7c0b41e 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -63,7 +63,11 @@ if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 +<<<<<<< HEAD call conda install -y -q -c conda-forge libuv=1.51 +======= +call conda install -y -q -c conda-forge libuv=1.39 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call conda install -y -q intel-openmp echo "install and test libtorch" diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index f143571a56922..85b72caccaba0 
100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,15 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI +<<<<<<< HEAD set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product set XPU_BUNDLE_VERSION=2025.1.3+5 +======= +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product +set XPU_BUNDLE_VERSION=2025.0.1+20 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +30,15 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 +<<<<<<< HEAD if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe set XPU_BUNDLE_VERSION=2025.2.1+20 +======= +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe + set XPU_BUNDLE_VERSION=2025.1.3+5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) :: Check if XPU bundle is target version or already installed @@ -90,3 +102,17 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end +<<<<<<< HEAD +======= + +if not "%XPU_ENABLE_KINETO%"=="1" goto install_end +:: Install Level Zero SDK +set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip +curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip" +echo "Installing level zero SDK..." 
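Editor's note on the xpu_install.bat hunk: the non-HEAD branch downloads the Level Zero SDK only when XPU_ENABLE_KINETO=1 and prepends its headers to %INCLUDE% (the download starts just above and the extraction continues on the next line). A hedged, Linux-flavoured sketch of the same step — the temp paths are assumptions, the URL is the one pinned in the script:

```bash
# Hedged sketch only: fetch the Level Zero SDK named in the batch script and
# expose its headers so Kineto's XPU profiler support can build.
if [ "${XPU_ENABLE_KINETO:-0}" = "1" ]; then
  curl -kL "https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip" \
       -o /tmp/level_zero_sdk.zip
  unzip -o /tmp/level_zero_sdk.zip -d /tmp/level_zero     # zip ships a top-level include/ dir
  export CPLUS_INCLUDE_PATH="/tmp/level_zero/include:${CPLUS_INCLUDE_PATH:-}"
fi
```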
+7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" +set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" +del "%SRC_DIR%\temp_build\level_zero_sdk.zip" + +:install_end +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index a7addd5d712d0..279b65b19b939 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -7,8 +7,11 @@ call "internal\install_python.bat" %PYTHON_EXEC% --version set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +<<<<<<< HEAD if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake @@ -18,7 +21,11 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake %PYTHON_EXEC% -m pip install pyyaml %PYTHON_EXEC% -m pip install mkl-include mkl-static +<<<<<<< HEAD %PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0 +======= +%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cmake.exe diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 6123e8abc8c0c..1adf8e63941ef 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -85,7 +85,11 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true # Create an isolated directory to store this builds pytorch checkout and conda # installation if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then +<<<<<<< HEAD MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)" +======= + MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi mkdir -p "$MAC_PACKAGE_WORK_DIR" || true if [[ -n ${GITHUB_ACTIONS} ]]; then @@ -96,11 +100,19 @@ fi whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist" mkdir -p "$whl_tmp_dir" +<<<<<<< HEAD mac_version='macosx-11_0-arm64' libtorch_arch='arm64' # Create a consistent wheel package name to rename the wheel to wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl" +======= +mac_version='macosx_11_0_arm64' +libtorch_arch='arm64' + +# Create a consistent wheel package name to rename the wheel to +wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ########################################################### @@ -124,12 +136,21 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages +<<<<<<< HEAD export MACOSX_DEPLOYMENT_TARGET=11.0 +======= +export 
MACOSX_DEPLOYMENT_TARGET=10.15 +export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + +SETUPTOOLS_PINNED_VERSION="=46.0.0" +PYYAML_PINNED_VERSION="=5.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in +<<<<<<< HEAD 3.14t) echo "Using 3.14 deps" mac_version='macosx-11.0-arm64' @@ -146,10 +167,21 @@ case $desired_python in echo "Using 3.13t deps" mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" +======= + 3.13t) + echo "Using 3.13 deps" + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.1.0" + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RENAME_WHEEL=false ;; 3.13) echo "Using 3.13 deps" +<<<<<<< HEAD NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) @@ -176,21 +208,87 @@ PINNED_PACKAGES=( python -mvenv ~/${desired_python}-build source ~/${desired_python}-build/bin/activate retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" +======= + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.1.0" + ;; + 3.12) + echo "Using 3.12 deps" + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.11) + echo "Using 3.11 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.10) + echo "Using 3.10 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + 3.9) + echo "Using 3.9 deps" + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=2.0.2" + ;; + *) + echo "Using default deps" + NUMPY_PINNED_VERSION="=1.11.3" + ;; +esac + +# Install into a fresh env +tmp_env_name="wheel_py$python_nodot" +conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} +source activate "$tmp_env_name" + +pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions +retry pip install -r "${pytorch_rootdir}/requirements.txt" || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry brew install libomp # For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule export USE_DISTRIBUTED=1 +<<<<<<< HEAD +======= +if [[ -n "$CROSS_COMPILE_ARM64" ]]; then + export CMAKE_OSX_ARCHITECTURES=arm64 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF pushd "$pytorch_rootdir" +<<<<<<< HEAD echo "Calling -m build --wheel --no-isolation at $(date)" _PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}" echo "Finished -m build --wheel --no-isolation at $(date)" +======= +echo "Calling setup.py bdist_wheel at $(date)" + +if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + echo "Calling setup.py bdist_wheel for split 
build (BUILD_LIBTORCH_WHL)" + BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" + echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" + echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" + echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +else + python setup.py bdist_wheel -d "$whl_tmp_dir" +fi + +echo "Finished setup.py bdist_wheel at $(date)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $package_type != 'libtorch' ]]; then echo "delocating wheel dependencies" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index c24a50b8b17ed..0af272b341abf 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -65,8 +65,21 @@ fi if [[ "$PACKAGE_TYPE" != libtorch ]]; then if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then +<<<<<<< HEAD pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" retry pip install -q numpy protobuf typing-extensions +======= + if [[ "$USE_SPLIT_BUILD" == "true" ]]; then + pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)" + pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)" + # todo: after folder is populated use the pypi_pkg channel instead + pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg" + retry pip install -q numpy protobuf typing-extensions + else + pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" + retry pip install -q numpy protobuf typing-extensions + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else pip install "\$pkg" retry pip install -q numpy protobuf typing-extensions diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 3f747e1a186ae..30a2daafd6c93 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,13 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" +<<<<<<< HEAD if [[ ! -d "${GIT_DIR}" ]]; then +======= + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -73,7 +79,18 @@ export PYTORCH_BUILD_NUMBER=1 : <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) +<<<<<<< HEAD TRITON_CONSTRAINT="platform_system == 'Linux'" +======= + +# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT +TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" + +# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. 
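Editor's note on the TRITON_CONSTRAINT logic above (and the cu129 special case that follows): the constraint is a standard PEP 508 environment marker, so pip decides at install time whether the wheel's triton pin applies to the machine doing the install. A hedged illustration of the string that ends up in PYTORCH_EXTRA_INSTALL_REQUIREMENTS — the version and short hash below are placeholders, not the pinned values from .ci/docker:

```bash
# Hedged example only; values are placeholders.
TRITON_VERSION="3.0.0"
TRITON_SHORTHASH="abcdef12"   # first 8 chars of .ci/docker/ci_commit_pins/triton.txt
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
echo "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS}"
# -> triton==3.0.0+gitabcdef12; platform_system == 'Linux' and platform_machine == 'x86_64'
# pip evaluates the marker on the installing machine, so installs on platforms
# that don't match the marker simply skip the triton dependency.
```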
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then + TRITON_CONSTRAINT="platform_system == 'Linux'" +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" @@ -86,10 +103,17 @@ fi # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then +<<<<<<< HEAD TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" +======= + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" @@ -130,6 +154,10 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" +<<<<<<< HEAD +======= +export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then @@ -166,6 +194,7 @@ if [[ "$(uname)" != Darwin ]]; then MEMORY_LIMIT_MAX_JOBS=12 NUM_CPUS=$(( $(nproc) - 2 )) +<<<<<<< HEAD if [[ "$(uname)" == Linux ]]; then # Defaults here for **binary** linux builds so they can be changed in one place export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))} @@ -173,6 +202,10 @@ if [[ "$(uname)" != Darwin ]]; then # For other builds export MAX_JOBS=${NUM_CPUS} fi +======= + # Defaults here for **binary** linux builds so they can be changed in one place + export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? 
${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cat >>"$envfile" <>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # this is special build with all dependencies packaged if [[ ${BUILD_NAME} == *-full* ]]; then UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full" @@ -51,12 +58,23 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( +<<<<<<< HEAD +======= + cache_control_flag="" + if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then + cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ +<<<<<<< HEAD --metadata "checksum-sha256=${shm_id}" +======= + --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 18dcde50e2b65..60ffb1e15a817 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,7 +15,12 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 +<<<<<<< HEAD export XPU_VERSION=2025.2 +======= + export XPU_VERSION=2025.1 + export XPU_ENABLE_KINETO=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 9326d9037e8b3..eb5b15b762cd1 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,11 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 +<<<<<<< HEAD export XPU_VERSION=2025.2 +======= + export XPU_VERSION=2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.circleci/scripts/functorch_doc_push_script.sh b/.circleci/scripts/functorch_doc_push_script.sh new file mode 100755 index 0000000000000..010956e212520 --- /dev/null +++ b/.circleci/scripts/functorch_doc_push_script.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# =================== The following code **should** be executed inside Docker container =================== + +# Install dependencies +sudo apt-get -y update +sudo apt-get -y install expect-dev + +# This is where the local pytorch install in the docker image is located +pt_checkout="/var/lib/jenkins/workspace" +source "$pt_checkout/.ci/pytorch/common_utils.sh" +echo "functorch_doc_push_script.sh: Invoked with $*" + +set -ex + +version=${DOCS_VERSION:-nightly} +echo "version: $version" + +# Build functorch docs +pushd $pt_checkout/functorch/docs +pip -q install -r requirements.txt +make html +popd + +git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages +pushd functorch_ghpages + +if [ $version == "main" ]; then + version=nightly +fi + +git rm -rf "$version" || true +mv 
"$pt_checkout/functorch/docs/build/html" "$version" + +git add "$version" || true +git status +git config user.email "soumith+bot@pytorch.org" +git config user.name "pytorchbot" +# If there aren't changes, don't make a commit; push is no-op +git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true +git status + +if [[ "${WITH_PUSH:-}" == true ]]; then + git push -u origin gh-pages +fi + +popd +# =================== The above code **should** be executed inside Docker container =================== diff --git a/.clang-format b/.clang-format index 67b722d967c7e..448aa5d0f343d 100644 --- a/.clang-format +++ b/.clang-format @@ -120,7 +120,10 @@ UseTab: Never Language: ObjC ColumnLimit: 120 AlignAfterOpenBracket: Align +<<<<<<< HEAD IndentWidth: 2 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false diff --git a/.clang-tidy b/.clang-tidy index 71ffdf8cb224c..e85101e531adf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -59,19 +59,31 @@ performance-*, -performance-enum-size, readability-container-size-empty, readability-delete-null-pointer, +<<<<<<< HEAD readability-duplicate-include, readability-misplaced-array-index, readability-redundant*, +======= +readability-duplicate-include +readability-misplaced-array-index, +readability-redundant* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) readability-simplify-subscript-expr, readability-string-compare, -readability-redundant-access-specifiers, -readability-redundant-control-flow, +<<<<<<< HEAD -readability-redundant-inline-specifier, ' HeaderFilterRegex: '^(aten/|c10/|torch/).*$' WarningsAsErrors: '*' LineFilter: - name: '/usr/include/.*' +======= +' +HeaderFilterRegex: '^(aten/|c10/|torch/).*$' +WarningsAsErrors: '*' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CheckOptions: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true diff --git a/.devcontainer/README.md b/.devcontainer/README.md index 7ef8da027ad9e..c7e65eeecedd1 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -61,8 +61,13 @@ You are now all set to start developing with PyTorch in a DevContainer environme ## Step 8: Build PyTorch To build pytorch from source, simply run: +<<<<<<< HEAD ```bash python -m pip install --no-build-isolation -v -e . +======= + ``` + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time. 
diff --git a/.editorconfig b/.editorconfig index e9581612a050e..0456b5cd51a07 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,11 +1,15 @@ root = true [*] +<<<<<<< HEAD charset = utf-8 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) end_of_line = lf insert_final_newline = true # Python +<<<<<<< HEAD [*.{py,pyi,py.in,pyi.in}] indent_style = space indent_size = 4 @@ -34,3 +38,12 @@ indent_style = tab indent_style = space indent_size = 2 end_of_line = crlf +======= +[*.py] +indent_style = space +indent_size = 4 + +# Make +[Makefile] +indent_style = tab +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.flake8 b/.flake8 index 937234edb4036..c3534a7e3db70 100644 --- a/.flake8 +++ b/.flake8 @@ -7,12 +7,24 @@ max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead ignore = +<<<<<<< HEAD E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, +======= + E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shebang has extra meaning in fbcode lints, so I think it's not worth trying # to line this up with executable bit EXE001, # these ignores are from flake8-bugbear; please fix! +<<<<<<< HEAD B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 +======= + B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907 + # these ignores are from flake8-comprehensions; please fix! + C407, + # these ignores are from flake8-logging-format; please fix! + G100,G101,G200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # these ignores are from flake8-simplify. please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, # SIM104 is already covered by pyupgrade ruff @@ -44,7 +56,10 @@ per-file-ignores = torch/__init__.py: F401,TOR901 torch/_custom_op/impl.py: TOR901 torch/_export/serde/upgrade.py: TOR901 +<<<<<<< HEAD torch/_functorch/predispatch.py: TOR901 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/_functorch/vmap.py: TOR901 torch/_inductor/test_operators.py: TOR901 torch/_library/abstract_impl.py: TOR901 @@ -69,7 +84,11 @@ exclude = ./docs/src, ./functorch/docs, ./functorch/examples, +<<<<<<< HEAD ./functorch/docs/source/tutorials, +======= + ./functorch/notebooks, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./scripts, ./test/generated_type_hints_smoketest.py, ./third_party, diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index 1ed74161f55de..a149b5cff4fd9 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,6 +1,7 @@ --- name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. 
+<<<<<<< HEAD title: '' labels: '' assignees: '' @@ -9,6 +10,11 @@ assignees: '' > NOTE: Remember to label this issue with "`ci: sev`" > If you want autorevert to be disabled, keep the ci: disable-autorevert label +======= +--- + +> NOTE: Remember to label this issue with "`ci: sev`" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index d9e0cc22bd3f5..5379c662410d6 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -1,10 +1,15 @@ --- name: Disable CI jobs (PyTorch Dev Infra only) about: Use this template to disable CI jobs +<<<<<<< HEAD title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME] labels: 'module: ci' assignees: '' +======= +title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" +labels: "module: ci" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --- > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca8863..dccb8b3f6e4ef 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,19 +12,26 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge +<<<<<<< HEAD - linux.12xlarge.memory - linux.24xlarge - linux.24xlarge.memory +======= + - linux.24xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral +<<<<<<< HEAD - linux.arm64.r7g.12xlarge.memory - linux.aws.h100 - linux.aws.h100.4 - linux.aws.h100.8 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu @@ -59,6 +66,7 @@ self-hosted-runner: - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 +<<<<<<< HEAD # gfx942 runners - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 @@ -66,6 +74,18 @@ self-hosted-runner: - rocm-docker # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable +======= + # MI300 runners + - linux.rocm.gpu.mi300.2 + - linux.rocm.gpu.mi300.4 + - rocm-docker + # Repo-specific Apple hosted runners + - macos-m1-ultra + - macos-m2-14 + # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) + - macos-m1-stable + - macos-m1-13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - macos-m1-14 # GitHub-hosted MacOS runners - macos-latest-xlarge diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml new file mode 100644 index 0000000000000..bccd42aa42f2c --- /dev/null +++ b/.github/actions/build-android/action.yml @@ -0,0 +1,78 @@ +name: build android + +description: build android for a specific arch + +inputs: + arch: + description: arch to build + required: true + arch-for-build-env: + description: | + arch to pass to build environment. 
+ This is currently different than the arch name we use elsewhere, which + should be fixed. + required: true + github-secret: + description: github token + required: true + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image: + required: true + description: Name of the base docker image to build with. + branch: + required: true + description: What branch we are building on. +outputs: + container_id: + description: Docker container identifier used to build the artifacts + value: ${{ steps.build.outputs.container_id }} + +runs: + using: composite + steps: + - name: Build-${{ inputs.arch }} + id: build + shell: bash + env: + BRANCH: ${{ inputs.branch }} + BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + DOCKER_IMAGE: ${{ inputs.docker-image }} + MATRIX_ARCH: ${{ inputs.arch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + set -exo pipefail + export container_name + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e SCCACHE_BUCKET \ + -e SCCACHE_REGION \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + git submodule sync && git submodule update -q --init --recursive --depth 1 + docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" + (echo "sudo chown -R jenkins . 
&& .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1 + + # Copy install binaries back + mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}" diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 15f193ef3a5dc..b64267fdf45c3 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,7 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false +<<<<<<< HEAD - name: Clean submodules post checkout id: clean-submodules if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} @@ -72,6 +73,8 @@ runs: git submodule foreach --recursive git clean -ffdx fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 338fc0c2a844c..0fc3a4ac53048 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -70,7 +70,11 @@ runs: set -eux # PyYAML 6.0 doesn't work with MacOS x86 anymore # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2 +<<<<<<< HEAD python3 -m pip install requests==2.27.1 pyyaml==6.0.2 +======= + python3 -m pip install requests==2.27.1 pyyaml==6.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref @@ -125,7 +129,11 @@ runs: TAG: ${{ steps.parse-ref.outputs.tag }} EVENT_NAME: ${{ github.event_name }} SCHEDULE: ${{ github.event.schedule }} +<<<<<<< HEAD HEAD_BRANCH: ${{ steps.parse-ref.outputs.branch }} +======= + HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: filter run: | echo "Workflow: ${GITHUB_WORKFLOW}" diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index f29d776402ba2..b0cfb1ce4e213 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -126,7 +126,11 @@ runs: shell: bash continue-on-error: true run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -274,6 +278,11 @@ runs: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" ) +<<<<<<< HEAD +======= + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index 48a8490985946..58fc16d3565be 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,11 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( +<<<<<<< HEAD "*.dist-info/*", +======= + "*.dist-info/**", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ): change_content_to_new_version(file) @@ -304,7 +308,12 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: def set_output() -> None: +<<<<<<< HEAD print("Setting output reuse=true") +======= + # Disable for now so we can monitor first + # pass +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print("reuse=true", file=env) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 4370549e4801a..2e8af1c65b4c6 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -28,10 +28,13 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" +<<<<<<< HEAD - name: Print GPU info (if present) shell: bash run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Check if in a container runner shell: bash id: check_container_runner @@ -86,6 +89,40 @@ runs: # Prune all of the docker images docker system prune -af +<<<<<<< HEAD +======= + - name: Manually resolve download.pytorch.org + shell: bash + continue-on-error: true + run: | + set +e + set -x + + PT_DOMAIN=download.pytorch.org + # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400, + # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last + # one is returned at random + RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1) + + if [ -z "${RESOLVED_IP}" ]; then + echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..." + RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1) + + if [ -z "${RESOLVED_IP}" ]; then + echo "Couldn't resolve ${PT_DOMAIN}, exiting..." 
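Editor's note on the two download.pytorch.org workarounds nearby: the setup-linux hunk (whose Google-DNS fallback and /etc/hosts pinning finish just below) resolves the domain manually, and the linux-test hunk earlier copies that pinned entry into the build container. A hedged way to confirm both views agree — the container name here is only an example:

```bash
# Illustrative check only; "builder" stands in for whatever container name the
# workflow actually generated.
getent hosts download.pytorch.org                         # host view: should show the pinned IP
docker exec builder getent hosts download.pytorch.org     # container view after the /etc/hosts copy
```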
+ exit 1 + fi + fi + + if grep -r "${PT_DOMAIN}" /etc/hosts; then + # Clean up any old records first + sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts + fi + + echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts + cat /etc/hosts + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Check that the docker daemon is running shell: bash continue-on-error: true diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index f77c6267f5067..e60a9c83f267a 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -59,6 +59,14 @@ runs: echo "$msg" exit 1 fi +<<<<<<< HEAD +======= + if [[ $ngpu -eq 1 ]]; then + echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs" + echo "$msg" + exit 1 + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Runner diskspace health check uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main @@ -111,6 +119,7 @@ runs: # This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries. # The group name corresponding to group ID 1 can change depending on the OS, so both are necessary. echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: configure aws credentials id: aws_creds @@ -131,3 +140,5 @@ runs: env | grep '^GITHUB' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" env | grep '^RUNNER' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 2ea330f93b490..90850b5551edb 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,12 +6,15 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true +<<<<<<< HEAD python-version: required: false type: string default: "3.10" description: | The python version to be used. 
Will be 3.10 by default +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs: using: composite @@ -44,24 +47,34 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { +<<<<<<< HEAD echo "CONDA=${CONDA}"; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 +<<<<<<< HEAD env: PYTHON_VERSION: ${{ inputs.python-version }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash run: | set +e set -x +<<<<<<< HEAD # Create new py_tmp env with python-version ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) +======= + PYTHON3=$(${CONDA_RUN} which python3) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -74,7 +87,11 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python +<<<<<<< HEAD PYTHON=$(${CONDA_RUN} -n py_tmp which python) +======= + PYTHON=$(${CONDA_RUN} which python) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml index b5e5f74db037a..8adc0c90a958b 100644 --- a/.github/actions/teardown-win/action.yml +++ b/.github/actions/teardown-win/action.yml @@ -23,6 +23,12 @@ runs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@main + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleaning up Windows workspace sometimes fails flakily with device or resource busy # error, meaning one or more processes haven't stopped completely yet. 
So trying to # retry this step several time similar to how checkout-pytorch GHA does diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 991cf9fb87eff..6b682f2a768e3 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -24,6 +24,10 @@ runs: -e PYTORCH_FINAL_PACKAGE_DIR \ -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ +<<<<<<< HEAD +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ @@ -33,6 +37,13 @@ runs: ) echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" +<<<<<<< HEAD +======= + if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then + # Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" # Generate test script diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 8af554d56ee57..9c375a2708937 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 69bbe7363897764f9e758d851cd0340147d27f94 +======= +4e94321c54617dd738a05bfedfc28bc0fa635b5c +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/fbgemm_rocm.txt b/.github/ci_commit_pins/fbgemm_rocm.txt index 19f5a2b2efa1a..aec0e9af23271 100644 --- a/.github/ci_commit_pins/fbgemm_rocm.txt +++ b/.github/ci_commit_pins/fbgemm_rocm.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 08ae0af1395c8d8471f4025deb6af9aef90b342f +======= +5fb5024118e9bb9decf96c2b0b1a8f0010bf56be +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt new file mode 100644 index 0000000000000..efbc3ceeb2afe --- /dev/null +++ b/.github/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +e03a63be43e33596f7f0a43b0f530353785e4a59 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 5d9b8d5d171ef..794fb1c780dd9 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 218d2ab791d437309f91e0486eb9fa7f00badc17 +======= +966da7e46f65d6d49df3e31214470a4fe5cc8e66 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 280d5ab77009f..df14012a202e5 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1,5 @@ +<<<<<<< HEAD df6798dfb931ce7c7fe5bed2447cd1092a5981af +======= +r2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/label_to_label.yml 
b/.github/label_to_label.yml index 782696fc782d3..89cf6640a6c2f 100644 --- a/.github/label_to_label.yml +++ b/.github/label_to_label.yml @@ -16,11 +16,14 @@ then: - "module: pt2-dispatcher" - any: +<<<<<<< HEAD - "vllm-compile" then: - "module: vllm" - "oncall: pt2" - any: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: vmap" then: - "module: functorch" @@ -33,6 +36,13 @@ then: - "module: dynamo" - any: +<<<<<<< HEAD +======= + - "module: flex attention" + then: + - "module: higher order operators" +- any: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: aotinductor" then: - "oncall: export" @@ -49,6 +59,7 @@ - "module: dynamic shapes" then: - "oncall: pt2" +<<<<<<< HEAD - any: - "release notes: distributed (c10d)" - "release notes: distributed (symm_mem)" @@ -58,3 +69,5 @@ - "oncall: distributed" then: - "ciflow/h100-distributed" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/labeler.yml b/.github/labeler.yml index 7b47b9fefb5dc..347fff90afbde 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,6 +130,7 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py +<<<<<<< HEAD "ciflow/vllm": - .github/ci_commit_pins/vllm.txt @@ -162,3 +163,5 @@ - torch/_inductor/kernel/mm.py - test/inductor/test_max_autotune.py - third_party/fbgemm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index 4ad15ecc7f8ba..576691f1e51be 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -76,7 +76,10 @@ - .github/ci_commit_pins/audio.txt - .github/ci_commit_pins/vision.txt - .github/ci_commit_pins/torchdynamo.txt +<<<<<<< HEAD - .github/ci_commit_pins/vllm.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - .ci/docker/ci_commit_pins/triton.txt approved_by: - pytorchbot @@ -131,6 +134,24 @@ - Lint - pull +<<<<<<< HEAD +======= +- name: Mobile + patterns: + - ios/** + - android/** + - test/mobile/** + approved_by: + - linbinyu + - IvanKobzarev + - dreiss + - raziel + mandatory_checks_name: + - EasyCLA + - Lint + - pull + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: PrimTorch patterns: - torch/_meta_registrations.py @@ -370,7 +391,10 @@ - leslie-fang-intel - jgong5 - EikanWang +<<<<<<< HEAD - CaoE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -422,7 +446,10 @@ approved_by: - leslie-fang-intel - jgong5 +<<<<<<< HEAD - CaoE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -477,6 +504,7 @@ - srossross - chillee - zou3519 +<<<<<<< HEAD - guilhermeleobas mandatory_checks_name: - EasyCLA @@ -494,6 +522,8 @@ - test/inductor_skips/** approved_by: - guilhermeleobas +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -525,6 +555,7 @@ - Lint - pull +<<<<<<< HEAD - name: typechecking patterns: - 'pyrefly.toml' @@ -560,6 +591,8 @@ - Lint - pull +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: superuser patterns: - '*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 74b0d243859a2..a0500b3b89da5 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,6 +1,7 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 ciflow_push_tags: +<<<<<<< HEAD - ciflow/b200 - ciflow/b200-symm-mem - ciflow/b200-distributed @@ -43,6 +44,37 @@ ciflow_push_tags: - ciflow/vllm - ciflow/win-arm64 - ciflow/xpu +======= +- ciflow/binaries +- ciflow/binaries_libtorch +- ciflow/binaries_wheel +- ciflow/inductor +- ciflow/inductor-periodic +- ciflow/inductor-rocm +- ciflow/inductor-perf-test-nightly-rocm +- ciflow/inductor-perf-compare +- ciflow/inductor-micro-benchmark +- ciflow/inductor-micro-benchmark-cpu-x86 +- ciflow/inductor-perf-test-nightly-x86-zen +- ciflow/inductor-cu126 +- ciflow/linux-aarch64 +- ciflow/mps +- ciflow/nightly +- ciflow/periodic +- ciflow/periodic-rocm-mi300 +- ciflow/rocm +- ciflow/rocm-mi300 +- ciflow/s390 +- ciflow/slow +- ciflow/trunk +- ciflow/unstable +- ciflow/xpu +- ciflow/torchbench +- ciflow/op-benchmark +- ciflow/pull +- ciflow/h100 +- ciflow/h100-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retryable_workflows: - pull - trunk @@ -51,4 +83,8 @@ retryable_workflows: - inductor-A100-perf-nightly labeler_config: labeler.yml label_to_label_config: label_to_label.yml +<<<<<<< HEAD mergebot: true +======= +mergebot: True +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index c274ca1e5914d..e4085fcc6adbb 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -1,15 +1,28 @@ # This file is to cache other dependencies not specified elsewhere in: +<<<<<<< HEAD # requirements.txt # requirements-build.txt +======= +# requirement.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # docs/requirements.txt # docs/cpp/requirements.txt # functorch/docs/requirements.txt # .ci/docker/requirements-ci.txt boto3==1.35.42 jinja2==3.1.6 +<<<<<<< HEAD lintrunner==0.12.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 pyyaml==6.0.2 requests==2.32.4 rich==14.1.0 +======= +lintrunner==0.10.7 +ninja==1.10.0.post1 +nvidia-ml-py==11.525.84 +pyyaml==6.0 +requests==2.32.4 +rich==10.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 new file mode 100644 index 0000000000000..b6e9a6ce9f3e5 --- /dev/null +++ b/.github/requirements/conda-env-macOS-ARM64 @@ -0,0 +1,5 @@ +# Not pinning certifi so that we can always get the latest certificates +certifi +pip=23.2.1 +pkg-config=0.29.2 +wheel=0.37.1 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt new file mode 100644 index 0000000000000..e8464f0a55ff5 --- /dev/null +++ 
b/.github/requirements/pip-requirements-macOS.txt @@ -0,0 +1,36 @@ +boto3==1.35.42 +cmake==3.27.* +expecttest==0.3.0 +fbscribelogger==0.1.7 +filelock==3.6.0 +hypothesis==6.56.4 +librosa>=0.6.2 +mpmath==1.3.0 +networkx==2.8.7 +ninja==1.10.2.4 +numba==0.59.0 +numpy==1.26.4 +opt-einsum>=3.3 +optree==0.13.0 +packaging==23.1 +parameterized==0.8.1 +pillow==10.3.0 +protobuf==5.29.4 +psutil==5.9.1 +pygments==2.15.0 +pytest-cpp==2.3.0 +pytest-flakefinder==1.1.0 +pytest-rerunfailures==10.3 +pytest-subtests==0.13.1 +pytest-xdist==3.3.1 +pytest==7.3.2 +pyyaml==6.0.2 +scipy==1.12.0 +setuptools==72.1.0 +sympy==1.13.3 +tlparse==0.3.30 +tensorboard==2.13.0 +typing-extensions==4.12.2 +unittest-xml-reporting<=3.2.0,>=2.0.0 +xdoctest==1.1.0 +z3-solver==4.12.2.0 diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index e541e7a86f653..040fa12f368a6 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -57,7 +57,10 @@ def get_rocm_version() -> str: rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" if not os.path.isfile(rocm_version_h): rocm_version_h = f"{rocm_path}/include/rocm_version.h" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. if os.path.isfile(rocm_version_h): RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") @@ -90,23 +93,38 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) +<<<<<<< HEAD version_suffix = "" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not release: # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 # while release build should only include the version, i.e. 
2.1.0 rocm_version = get_rocm_version() version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" version += version_suffix +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" triton_repo = "https://github.com/openai/triton" if device == "rocm": +<<<<<<< HEAD triton_pkg_name = "triton" triton_repo = "https://github.com/ROCm/triton" +======= + triton_repo = "https://github.com/ROCm/triton" + rocm_version = get_rocm_version() # e.g., "7.0.1" + if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): + triton_pkg_name = "triton" + else: + triton_pkg_name = "pytorch-triton-rocm" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -119,7 +137,10 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: +<<<<<<< HEAD check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version @@ -163,6 +184,7 @@ def build_triton( cwd=triton_basedir, ) +<<<<<<< HEAD # For gpt-oss models, triton requires this extra triton_kernels wheel # triton_kernels came after pytorch release/2.8 triton_kernels_dir = Path(f"{triton_basedir}/python/triton_kernels") @@ -170,6 +192,8 @@ def build_triton( kernels_whl_path = next(iter((triton_kernels_dir / "dist").glob("*.whl"))) shutil.copy(kernels_whl_path, Path.cwd()) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Path.cwd() / whl_path.name diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 8032008edf122..63a82ca1b3dd5 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -275,7 +275,11 @@ def delete_branches() -> None: delete_branch(git_repo, branch) +<<<<<<< HEAD def delete_old_tags() -> None: +======= +def delete_old_ciflow_tags() -> None: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Deletes ciflow tags if they are associated with a closed PR or a specific # commit. Lightweight tags don't have information about the date they were # created, so we can't check how old they are. 
The script just assumes that @@ -288,14 +292,20 @@ def delete_tag(tag: str) -> None: delete_branch(git_repo, f"refs/tags/{tag}") tags = git_repo._run_git("tag").splitlines() +<<<<<<< HEAD CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$") AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$") +======= + open_pr_numbers = [x["number"] for x in get_open_prs()] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for tag in tags: try: if ESTIMATED_TOKENS[0] > 400: print("Estimated tokens exceeded, exiting") break +<<<<<<< HEAD if not CIFLOW_TAG_REGEX.match(tag) and not AUTO_REVERT_TAG_REGEX.match(tag): continue @@ -311,6 +321,18 @@ def delete_tag(tag: str) -> None: if tag_age_days > 7: print(f"[{tag}] Tag is older than 7 days, deleting") +======= + if not tag.startswith("ciflow/"): + continue + re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag) + re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag) + if re_match_pr: + pr_number = int(re_match_pr.group(1)) + if pr_number in open_pr_numbers: + continue + delete_tag(tag) + elif re_match_sha: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) delete_tag(tag) except Exception as e: print(f"Failed to check tag {tag}: {e}") @@ -318,4 +340,8 @@ def delete_tag(tag: str) -> None: if __name__ == "__main__": delete_branches() +<<<<<<< HEAD delete_old_tags() +======= + delete_old_ciflow_tags() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index 04f4707a55c3f..bc81f319bf997 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -39,9 +39,13 @@ def main() -> None: pull_request_label_names = [label.name for label in pull_request_labels] issue_label_names = [label.name for label in issue_labels] labels_to_add = [ +<<<<<<< HEAD label for label in issue_label_names if label not in pull_request_label_names and label != "actionable" +======= + label for label in issue_label_names if label not in pull_request_label_names +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] if not labels_to_add: print("The pull request already has the same labels.") diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 592c7aab6d933..c8419a7631887 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -18,7 +18,10 @@ REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) 
(#|https://github.com/pytorch/pytorch/issues/)([0-9]+)" +<<<<<<< HEAD MAIN_BRANCH = "main" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PREFIX = "test-config/" @@ -41,9 +44,15 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs +<<<<<<< HEAD DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" # and unstable jobs UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" +======= +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=HnkH0xQWnnsoeMsSIVf9291NE5c4jWSa" +# and unstable jobs +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iP_F8gBs60PfOMAJ8gnn1paVrzM1WYsK" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" @@ -98,7 +107,11 @@ def parse_args() -> Any: parser.add_argument( "--branch", type=str, +<<<<<<< HEAD default=MAIN_BRANCH, +======= + default="main", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) help="the branch name", ) return parser.parse_args() @@ -457,7 +470,10 @@ def download_json(url: str, headers: dict[str, str], num_retries: int = 3) -> An def set_output(name: str, val: Any) -> None: +<<<<<<< HEAD print(f"Setting output {name}={val}") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print(f"{name}={val}", file=env) @@ -497,17 +513,22 @@ def check_for_setting(labels: set[str], body: str, setting: str) -> bool: def perform_misc_tasks( +<<<<<<< HEAD labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str, branch: Optional[str] = None, tag: Optional[str] = None, +======= + labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> None: """ In addition to apply the filter logic, the script also does the following misc tasks to set keep-going and is-unstable variables """ +<<<<<<< HEAD set_output( "keep-going", branch == MAIN_BRANCH @@ -516,6 +537,9 @@ def perform_misc_tasks( or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag)) or check_for_setting(labels, pr_body, "keep-going"), ) +======= + set_output("keep-going", check_for_setting(labels, pr_body, "keep-going")) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_output( "ci-verbose-test-logs", check_for_setting(labels, pr_body, "ci-verbose-test-logs"), @@ -638,8 +662,11 @@ def main() -> None: test_matrix=filtered_test_matrix, job_name=args.job_name, pr_body=pr_body if pr_body else "", +<<<<<<< HEAD branch=args.branch, tag=tag, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index fd04922f39999..c42d3e41a0a53 100644 --- 
a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,23 +16,37 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this +<<<<<<< HEAD CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] +======= +CUDA_ARCHES = ["12.6", "12.8", "12.9"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", "12.9": "12.9.1", +<<<<<<< HEAD "13.0": "13.0.2", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", "12.9": "9", +<<<<<<< HEAD "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this ROCM_ARCHES = ["6.4", "7.0"] +======= +} + +# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this +ROCM_ARCHES = ["6.3", "6.4"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_ARCHES = ["xpu"] @@ -40,11 +54,16 @@ CPU_S390X_ARCH = ["cpu-s390x"] +<<<<<<< HEAD CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"] +======= +CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( +<<<<<<< HEAD "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " @@ -133,6 +152,76 @@ "tcmlib==1.4.0 | " "umf==0.11.0 | " "intel-pti==0.13.1" +======= + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.8": ( + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.8.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "xpu": ( + "intel-cmplr-lib-rt==2025.1.1 | " + "intel-cmplr-lib-ur==2025.1.1 | " + "intel-cmplr-lic-rt==2025.1.1 | " + "intel-sycl-rt==2025.1.1 | " + "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.1.0 | " + "onemkl-sycl-dft==2025.1.0 | " + "onemkl-sycl-lapack==2025.1.0 | " + "onemkl-sycl-rng==2025.1.0 | " + "onemkl-sycl-sparse==2025.1.0 | " + "dpcpp-cpp-rt==2025.1.1 | " + "intel-opencl-rt==2025.1.1 | " + "mkl==2025.1.0 | " + "intel-openmp==2025.1.1 | " + "tbb==2022.1.0 | " + "tcmlib==1.3.0 | " + "umf==0.10.0 | " + "intel-pti==0.12.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ), } @@ -143,7 +232,13 @@ def get_nccl_wheel_version(arch_version: str) -> str: requirements = map( str.strip, 
re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) ) +<<<<<<< HEAD return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1] +======= + return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[ + 1 + ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def read_nccl_pin(arch_version: str) -> str: @@ -210,7 +305,11 @@ def arch_type(arch_version: str) -> str: "cpu": "libtorch-cxx11-builder:cpu", } +<<<<<<< HEAD FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] +======= +FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: @@ -241,11 +340,15 @@ def generate_libtorch_matrix( arches += CUDA_ARCHES arches += ROCM_ARCHES elif os == "windows": +<<<<<<< HEAD # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up # in 2.10 windows_cuda_arches = CUDA_ARCHES.copy() windows_cuda_arches.remove("12.9") arches += windows_cuda_arches +======= + arches += CUDA_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -294,6 +397,10 @@ def generate_wheels_matrix( os: str, arches: Optional[list[str]] = None, python_versions: Optional[list[str]] = None, +<<<<<<< HEAD +======= + use_split_build: bool = False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> list[dict[str, str]]: package_type = "wheel" if os == "linux" or os == "linux-aarch64" or os == "linux-s390x": @@ -309,11 +416,15 @@ def generate_wheels_matrix( if os == "linux": arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": +<<<<<<< HEAD # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up # in 2.10 windows_cuda_arches = CUDA_ARCHES.copy() windows_cuda_arches.remove("12.9") arches += windows_cuda_arches + XPU_ARCHES +======= + arches += CUDA_ARCHES + XPU_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -336,6 +447,7 @@ def generate_wheels_matrix( else arch_version ) +<<<<<<< HEAD # TODO: Enable python 3.14 for rest if os not in [ "linux", @@ -350,6 +462,25 @@ def generate_wheels_matrix( if ( arch_version in ["13.0", "12.9", "12.8", "12.6"] +======= + # TODO: Enable python 3.13t on cpu-s390x + if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": + continue + + if use_split_build and ( + arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" + ): + raise RuntimeError( + "Split build is only supported on linux with cuda 12* and cpu.\n" + f"Currently attempting to build on arch version {arch_version} and os {os}.\n" + "Please modify the matrix generation to exclude this combination." 
+ ) + + # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install + + if ( + arch_version in ["12.9", "12.8", "12.6"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -360,6 +491,10 @@ def generate_wheels_matrix( "gpu_arch_type": gpu_arch_type, "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, +<<<<<<< HEAD +======= + "use_split_build": "True" if use_split_build else "False", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -382,6 +517,33 @@ def generate_wheels_matrix( ), # include special case for aarch64 build, remove the -aarch64 postfix } ) +<<<<<<< HEAD +======= + # Special build building to use on Colab. Python 3.11 for 12.6 CUDA + if python_version == "3.11" and arch_version == CUDA_STABLE: + ret.append( + { + "python_version": python_version, + "gpu_arch_type": gpu_arch_type, + "gpu_arch_version": gpu_arch_version, + "desired_cuda": translate_desired_cuda( + gpu_arch_type, gpu_arch_version + ), + "use_split_build": "True" if use_split_build else "False", + "container_image": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], + "package_type": package_type, + "pytorch_extra_install_requirements": "", + "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 + ".", "_" + ), + } + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else: ret.append( { @@ -391,6 +553,10 @@ def generate_wheels_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), +<<<<<<< HEAD +======= + "use_split_build": "True" if use_split_build else "False", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -412,7 +578,10 @@ def generate_wheels_matrix( return ret +<<<<<<< HEAD validate_nccl_dep_consistency("13.0") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 7d22e5059b7cb..f121486890d22 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -22,7 +22,10 @@ LABEL_CIFLOW_PERIODIC = "ciflow/periodic" LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch" LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" +<<<<<<< HEAD LABEL_CIFLOW_ROCM = "ciflow/rocm" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @dataclass @@ -59,7 +62,13 @@ class BinaryBuildWorkflow: is_scheduled: str = "" branches: str = "nightly" # Mainly for macos +<<<<<<< HEAD macos_runner: str = "macos-14-xlarge" +======= + cross_compile_arm64: bool = False + macos_runner: str = "macos-14-xlarge" + use_split_build: bool = False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Mainly used for libtorch builds build_variant: str = "" @@ -70,6 +79,12 @@ def __post_init__(self) -> None: for item in [self.os, "binary", self.package_type, self.build_variant] if item != "" ) +<<<<<<< HEAD +======= + if self.use_split_build: + # added to distinguish concurrency groups + self.build_environment += "-split" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = ( @@ -112,6 +127,24 @@ class OperatingSystem: isolated_workflow=True, ), ), +<<<<<<< HEAD +======= + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # use_split_build=True, + # arches=["11.8", "12.1", "12.4", "cpu"], + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + # isolated_workflow=True, + # ), + # use_split_build=True, + # ), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ -127,6 +160,50 @@ class OperatingSystem: ), ] +<<<<<<< HEAD +======= +LINUX_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="manywheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.LINUX, + arches=["12.6", "12.8", "12.9", "6.4"], + python_versions=["3.9"], + ), + branches="main", + ), + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # arches=["11.8", "12.1", "12.4"], + # python_versions=["3.9"], + # use_split_build=True, + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_PERIODIC}, + # ), + # branches="main", + # use_split_build=True, + # ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ), +] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, @@ -212,6 +289,42 @@ class OperatingSystem: ), ] +<<<<<<< HEAD +======= +WINDOWS_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ciflow_config=CIFlowConfig( + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + build_variant=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + 
OperatingSystem.WINDOWS, + generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="main", + ciflow_config=CIFlowConfig( + isolated_workflow=True, + ), + ), +] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, @@ -222,6 +335,10 @@ class OperatingSystem: generate_binary_build_matrix.RELEASE, libtorch_variants=["shared-with-deps"], ), +<<<<<<< HEAD +======= + cross_compile_arm64=False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, @@ -234,6 +351,10 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.MACOS_ARM64 ), +<<<<<<< HEAD +======= + cross_compile_arm64=False, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, @@ -293,10 +414,24 @@ def main() -> None: S390X_BINARY_BUILD_WORKFLOWS, ), ( +<<<<<<< HEAD +======= + jinja_env.get_template("linux_binary_build_workflow.yml.j2"), + LINUX_BINARY_SMOKE_WORKFLOWS, + ), + ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS, ), ( +<<<<<<< HEAD +======= + jinja_env.get_template("windows_binary_build_workflow.yml.j2"), + WINDOWS_BINARY_SMOKE_WORKFLOWS, + ), + ( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index b04cbed76e955..bf8e669531096 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -136,10 +136,17 @@ def find_job_id_name(args: Any) -> tuple[str, str]: def set_output(name: str, val: Any) -> None: +<<<<<<< HEAD print(f"Setting output {name}={val}") if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: print(f"{name}={val}", file=env) +======= + if os.getenv("GITHUB_OUTPUT"): + with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: + print(f"{name}={val}", file=env) + print(f"setting {name}={val}") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else: print(f"::set-output name={name}::{val}") diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 110015988a5c3..4bd7228b9298c 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -18,7 +18,10 @@ class GitHubComment: body_text: str created_at: str author_login: str +<<<<<<< HEAD author_url: Optional[str] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association: str editor_login: Optional[str] database_id: int diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh 
index b353617a45b2b..cd04147193c63 100755
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@@ -2,7 +2,11 @@
 set -ex
 # Use uv to speed up lintrunner init
+<<<<<<< HEAD
 python3 -m pip install -U uv==0.8.* setuptools
+=======
+python3 -m pip install uv==0.1.45 setuptools
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries
diff --git a/.github/scripts/parse_ref.py b/.github/scripts/parse_ref.py
index e821750a49e10..05433caa11efa 100755
--- a/.github/scripts/parse_ref.py
+++ b/.github/scripts/parse_ref.py
@@ -5,7 +5,10 @@
 def set_output(name: str, val: str) -> None:
+<<<<<<< HEAD
     print(f"Setting output {name}={val}")
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
             print(f"{name}={val}", file=env)
diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py
index baf560234549b..9af3be41dd65b 100644
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@@ -262,12 +262,16 @@ def is_exception_branch(branch: str) -> bool:
     """
     Branches that get opted out of experiments by default, until they're explicitly enabled.
     """
+<<<<<<< HEAD
     return branch.split("/", maxsplit=1)[0] in {
         "main",
         "nightly",
         "release",
         "landchecks",
     }
+=======
+    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def load_yaml(yaml_text: str) -> Any:
diff --git a/.github/scripts/tag_docker_images_for_release.py b/.github/scripts/tag_docker_images_for_release.py
new file mode 100644
index 0000000000000..b2bf474575f6f
--- /dev/null
+++ b/.github/scripts/tag_docker_images_for_release.py
@@ -0,0 +1,64 @@
+import argparse
+import subprocess
+
+import generate_binary_build_matrix
+
+
+def tag_image(
+    image: str,
+    default_tag: str,
+    release_version: str,
+    dry_run: str,
+    tagged_images: dict[str, bool],
+) -> None:
+    if image in tagged_images:
+        return
+    release_image = image.replace(f"-{default_tag}", f"-{release_version}")
+    print(f"Tagging {image} to {release_image} , dry_run: {dry_run}")
+
+    if dry_run == "disabled":
+        subprocess.check_call(["docker", "pull", image])
+        subprocess.check_call(["docker", "tag", image, release_image])
+        subprocess.check_call(["docker", "push", release_image])
+    tagged_images[image] = True
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--version",
+        help="Version to tag",
+        type=str,
+        default="2.2",
+    )
+    parser.add_argument(
+        "--dry-run",
+        help="No Runtime Error check",
+        type=str,
+        choices=["enabled", "disabled"],
+        default="enabled",
+    )
+
+    options = parser.parse_args()
+    tagged_images: dict[str, bool] = {}
+    platform_images = [
+        generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
+        generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
+    ]
+    default_tag = generate_binary_build_matrix.DEFAULT_TAG
+
+    for platform_image in platform_images:  # type: ignore[attr-defined]
+        for arch in platform_image.keys():  # type: ignore[attr-defined]
+            if arch == "cpu-s390x":
+                continue
+            tag_image(
+                platform_image[arch],  # type: ignore[index]
+                default_tag,
+                options.version,
+                options.dry_run,
+                tagged_images,
+            )
+
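For reference, the image-renaming step of the tag_docker_images_for_release.py script added above boils down to a simple suffix swap on the container tag. A minimal sketch of that one step, using a hypothetical image name rather than a real entry from WHEEL_CONTAINER_IMAGES:

```python
# Sketch of the tag_image() renaming step only; the image string is a made-up example.
default_tag = "main"          # stands in for generate_binary_build_matrix.DEFAULT_TAG
release_version = "2.2"       # stands in for the --version argument
image = "pytorch/manylinux2_28-builder:cuda12.8-main"  # hypothetical source image

release_image = image.replace(f"-{default_tag}", f"-{release_version}")
print(release_image)  # pytorch/manylinux2_28-builder:cuda12.8-2.2
```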
+ +if __name__ == "__main__": + main() diff --git a/.github/scripts/td_llm_indexer.sh b/.github/scripts/td_llm_indexer.sh index cc8f363659ba6..834664fc00d24 100644 --- a/.github/scripts/td_llm_indexer.sh +++ b/.github/scripts/td_llm_indexer.sh @@ -6,7 +6,11 @@ set -euxo pipefail cd llm-target-determinator pip install -q -r requirements.txt cd ../codellama +<<<<<<< HEAD pip install --no-build-isolation -v -e . +======= +pip install -e . +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install numpy==1.26.0 # Run indexer diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py index 74ce276c9d10a..c699fad7346b7 100644 --- a/.github/scripts/test_check_labels.py +++ b/.github/scripts/test_check_labels.py @@ -38,7 +38,10 @@ def mock_get_comments() -> list[GitHubComment]: body_text="mock_body_text", created_at="", author_login="", +<<<<<<< HEAD author_url=None, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association="", editor_login=None, database_id=1, @@ -49,7 +52,10 @@ def mock_get_comments() -> list[GitHubComment]: body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""), created_at="", author_login=BOT_AUTHORS[1], +<<<<<<< HEAD author_url=None, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association="", editor_login=None, database_id=2, diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 790deb85ef8c3..4e414a934d2ae 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,17 +27,26 @@ get_drci_classifications, gh_get_team_members, GitHubPR, +<<<<<<< HEAD iter_issue_timeline_until_comment, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) JobCheckState, main as trymerge_main, MandatoryChecksMissingError, MergeRule, +<<<<<<< HEAD PostCommentError, RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, sha_from_committed_event, sha_from_force_push_after, +======= + RE_GHSTACK_DESC, + read_merge_rules, + remove_job_name_suffix, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) validate_revert, ) @@ -74,9 +83,12 @@ def save_mocked_queries(obj: Any) -> None: if key in mocked_queries: return mocked_queries[key] +<<<<<<< HEAD # TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved raise ValueError(f"Key {key} could not be found in gql_mocks") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) try: rc = fallback_function(*args) except HTTPError as err: @@ -128,7 +140,11 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True +<<<<<<< HEAD self.comment_id = 12345 # Set to non-zero value +======= + self.comment_id = 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -156,9 +172,15 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, +<<<<<<< HEAD comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, +======= + 
dry_run: bool = False, + skip_mandatory_checks: bool = False, + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -474,9 +496,15 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, +<<<<<<< HEAD comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, +======= + dry_run=mock.ANY, + skip_mandatory_checks=True, + comment_id=mock.ANY, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=False, ) @@ -489,9 +517,15 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, +<<<<<<< HEAD comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, +======= + dry_run=mock.ANY, + skip_mandatory_checks=False, + comment_id=mock.ANY, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=False, ) @@ -589,6 +623,7 @@ def test_get_merge_base(self, *args: Any) -> None: self.assertEqual(mock_merge_base, pr.get_merge_base()) mocked_gh_fetch_merge_base.assert_called_once() +<<<<<<< HEAD def test_app_can_revert(self, *args: Any) -> None: pr = GitHubPR("pytorch", "pytorch", 164660) repo = DummyGitRepo() @@ -606,6 +641,8 @@ def test_app_can_revert(self, *args: Any) -> None: "pytorch-auto-revert", ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") @@ -1159,6 +1196,7 @@ def test__revlist_to_prs_two_prs( ) +<<<<<<< HEAD @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") @mock.patch( @@ -1330,5 +1368,7 @@ def test_get_commit_sha_at_comment_exception( self.assertIsNone(sha) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c258284a00d83..bb63f20e87076 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -108,6 +108,13 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): fragment PRCheckSuites on CheckSuiteConnection { edges { node { +<<<<<<< HEAD +======= + app { + name + databaseId + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflowRun { workflow { name @@ -234,7 +241,10 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): createdAt author { login +<<<<<<< HEAD url +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } authorAssociation editor { @@ -451,6 +461,7 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +<<<<<<< HEAD def iter_issue_timeline_until_comment( org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200 ) -> Any: @@ -508,6 +519,8 @@ def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]: return 
ev.get("after_sha") or ev.get("head_sha")
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
     rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
     return rc["data"]["repository"]["pullRequest"]
@@ -795,6 +808,7 @@ def get_changed_files_count(self) -> int:
     def last_commit(self) -> Any:
         return self.info["commits"]["nodes"][-1]["commit"]
+<<<<<<< HEAD
     def last_commit_sha(self, default: Optional[str] = None) -> str:
         # for commits, the oid is the sha
@@ -803,16 +817,26 @@ def last_commit_sha(self, default: Optional[str] = None) -> str:
         return str(self.last_commit().get("oid", default))
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     def get_merge_base(self) -> str:
         if self.merge_base:
             return self.merge_base
+<<<<<<< HEAD
         last_commit_sha = self.last_commit_sha()
+=======
+        last_commit_oid = self.last_commit()["oid"]
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         # NB: We could use self.base_ref() here for regular PR, however, that doesn't
         # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
         # so let's just use main instead
         self.merge_base = gh_fetch_merge_base(
+<<<<<<< HEAD
             self.org, self.project, last_commit_sha, self.default_branch()
+=======
+            self.org, self.project, last_commit_oid, self.default_branch()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         )
         # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@@ -901,6 +925,7 @@ def get_approved_by(self) -> list[str]:
     def get_commit_count(self) -> int:
         return int(self.info["commits_with_authors"]["totalCount"])
+<<<<<<< HEAD
     def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
        """
        Get the PR head commit SHA that was present when a specific comment was posted.
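The hunks above introduce timeline-based helpers (iter_issue_timeline_until_comment, sha_from_committed_event, sha_from_force_push_after, get_commit_sha_at_comment). As a rough sketch of that idea (not the actual trymerge implementation), one can page through GitHub's REST issue-timeline endpoint and keep the last head SHA seen before the target comment appears; any event field not visible in the diff above is an assumption here:

```python
# Hedged sketch only: approximates the timeline walk added in this diff.
# Assumes a GITHUB_TOKEN env var; event shapes follow GitHub's REST
# issues-timeline API, and the force-push fallback mirrors the
# ev.get("after_sha") or ev.get("head_sha") expression shown above.
import os
from typing import Any, Optional

import requests


def head_sha_at_comment(org: str, repo: str, pr_num: int, comment_id: int) -> Optional[str]:
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/timeline"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"token {os.environ['GITHUB_TOKEN']}",
    }
    sha: Optional[str] = None
    page = 1
    while True:
        resp = requests.get(url, headers=headers, params={"per_page": 100, "page": page})
        resp.raise_for_status()
        events: list[dict[str, Any]] = resp.json()
        if not events:
            return None  # exhausted the timeline without finding the comment
        for ev in events:
            if ev.get("event") == "committed":
                sha = ev.get("sha")  # commit pushed to the PR branch
            elif ev.get("event") == "head_ref_force_pushed":
                sha = ev.get("after_sha") or ev.get("head_sha")
            elif ev.get("event") == "commented" and ev.get("id") == comment_id:
                return sha  # head SHA as of the moment the comment was posted
        page += 1
```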
@@ -939,6 +964,8 @@ def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]: print(f"Did not find comment with id {comment_id} in the PR timeline") return None +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) @@ -1092,9 +1119,14 @@ def _comment_from_node(node: Any) -> GitHubComment: editor = node["editor"] return GitHubComment( body_text=node["bodyText"], +<<<<<<< HEAD created_at=node.get("createdAt", ""), author_login=node["author"]["login"], author_url=node["author"].get("url", None), +======= + created_at=node["createdAt"] if "createdAt" in node else "", + author_login=node["author"]["login"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association=node["authorAssociation"], editor_login=editor["login"] if editor else None, database_id=node["databaseId"], @@ -1256,7 +1288,11 @@ def merge_into( *, skip_mandatory_checks: bool = False, dry_run: bool = False, +<<<<<<< HEAD comment_id: int, +======= + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current_checks: Optional[list[str]] = None, ) -> None: # Raises exception if matching rule is not found @@ -1272,7 +1308,11 @@ def merge_into( skip_internal_checks=can_skip_internal_checks(self, comment_id), ignore_current_checks=ignore_current_checks, ) +<<<<<<< HEAD additional_merged_prs = self.merge_changes_locally( +======= + additional_merged_prs = self.merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repo, skip_mandatory_checks, comment_id ) @@ -1301,7 +1341,11 @@ def merge_into( broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []), flaky_checks=ignorable_checks.get("FLAKY", []), unstable_checks=ignorable_checks.get("UNSTABLE", []), +<<<<<<< HEAD last_commit_sha=self.last_commit_sha(default=""), +======= + last_commit_sha=self.last_commit().get("oid", ""), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) merge_base_sha=self.get_merge_base(), merge_commit_sha=merge_commit_sha, is_failed=False, @@ -1322,7 +1366,11 @@ def merge_into( dry_run=dry_run, ) +<<<<<<< HEAD def merge_changes_locally( +======= + def merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) self, repo: GitRepo, skip_mandatory_checks: bool = False, @@ -1331,15 +1379,38 @@ def merge_changes_locally( skip_all_rule_checks: bool = False, ) -> list["GitHubPR"]: """ +<<<<<<< HEAD :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally +======= + :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) +<<<<<<< HEAD # It's okay to skip the commit SHA check for ghstack PRs since # authoring requires write access to the repo. 
if self.is_ghstack_pr(): +======= + if not self.is_ghstack_pr(): + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + repo.fetch(self.last_commit()["oid"], pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if pulled_sha != latest_pr_status.last_commit()["oid"]: + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + else: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1347,6 +1418,7 @@ def merge_changes_locally( skip_all_rule_checks=skip_all_rule_checks, ) +<<<<<<< HEAD msg = self.gen_commit_message() pr_branch_name = f"__pull-request-{self.pr_num}__init__" @@ -1389,6 +1461,8 @@ def merge_changes_locally( ) return [] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1593,7 +1667,11 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] +<<<<<<< HEAD hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" +======= + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2022,26 +2100,40 @@ def validate_revert( else pr.get_comment_by_id(comment_id) ) if comment.editor_login is not None: +<<<<<<< HEAD raise PostCommentError( "Halting the revert as the revert comment has been edited." ) +======= + raise PostCommentError("Don't want to revert based on edited command") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) author_association = comment.author_association author_login = comment.author_login allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"] # For some reason, one can not be a member of private repo, only CONTRIBUTOR if pr.is_base_repo_private(): allowed_reverters.append("CONTRIBUTOR") +<<<<<<< HEAD # Special case the pytorch-auto-revert app, whose does not have association # But should be able to issue revert command if comment.author_url == "https://github.com/apps/pytorch-auto-revert": allowed_reverters.append("NONE") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if author_association not in allowed_reverters: raise PostCommentError( f"Will not revert as @{author_login} is not one of " f"[{', '.join(allowed_reverters)}], but instead is {author_association}." 
) +<<<<<<< HEAD +======= + # Raises exception if matching rule is not found, but ignores all status checks + find_matching_merge_rule( + pr, repo, skip_mandatory_checks=True, skip_internal_checks=True + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) commit_sha = get_pr_commit_sha(repo, pr) return (author_login, commit_sha) @@ -2292,14 +2384,24 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, +<<<<<<< HEAD comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, +======= + dry_run: bool = False, + skip_mandatory_checks: bool = False, + comment_id: Optional[int] = None, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: +<<<<<<< HEAD initial_commit_sha = pr.last_commit_sha() +======= + initial_commit_sha = pr.last_commit()["oid"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2370,7 +2472,11 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) +<<<<<<< HEAD if initial_commit_sha != pr.last_commit_sha(): +======= + if initial_commit_sha != pr.last_commit()["oid"]: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2537,7 +2643,11 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync +<<<<<<< HEAD pr.merge_changes_locally( +======= + pr.merge_changes( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2552,6 +2662,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: +<<<<<<< HEAD # Ensure comment id is set, else fail if not args.comment_id: raise ValueError( @@ -2564,6 +2675,14 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, +======= + merge( + pr, + repo, + dry_run=args.dry_run, + skip_mandatory_checks=args.force, + comment_id=args.comment_id, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ignore_current=args.ignore_current, ) except Exception as e: @@ -2585,7 +2704,11 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], +<<<<<<< HEAD last_commit_sha=pr.last_commit_sha(default=""), +======= + last_commit_sha=pr.last_commit().get("oid", ""), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 75c916ecdbef7..28977ee042ffc 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,7 +17,10 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +<<<<<<< HEAD set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% @@ -35,9 +38,12 @@ cd magma mkdir build && cd build set GPU_TARGET=All +<<<<<<< HEAD if "%CUVER_NODOT%" == "130" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%CUVER_NODOT%" == "129" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) diff --git a/.github/scripts/windows/build_triton.bat b/.github/scripts/windows/build_triton.bat index d26dc8bf3b198..761d5cfbc962f 100644 --- a/.github/scripts/windows/build_triton.bat +++ b/.github/scripts/windows/build_triton.bat @@ -1,12 +1,30 @@ @echo on +<<<<<<< HEAD set 
DESIRED_PYTHON=%PY_VERS% call .ci/pytorch/windows/internal/install_python.bat :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 %PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 +======= +set PYTHON_PREFIX=%PY_VERS:.=% +set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py% +call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +:: Create a new conda environment +if "%PY_VERS%" == "3.13t" ( + call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13 +) else ( + call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% +) +:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 +call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dir "%VC_INSTALL_PATH%" call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 +<<<<<<< HEAD %PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +======= +call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 064eea7592230..c62af9388e5fd 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -4,7 +4,11 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} +<<<<<<< HEAD {%- set timeout_minutes_windows_binary = 360 -%} +======= +{%- set timeout_minutes_windows_binary = 300 -%} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- macro concurrency(build_environment) -%} concurrency: @@ -32,7 +36,11 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index baff04967e3ae..4b6e2db12a7ca 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -56,7 +56,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -71,15 +75,22 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.arm64.r7g.12xlarge.memory +======= + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "rocm" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -117,12 +128,21 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + {%- elif config["gpu_arch_type"] == "cuda" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -138,7 +158,11 @@ jobs: contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -153,10 +177,17 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -164,7 +195,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -177,9 +212,12 @@ jobs: runs-on: linux.rocm.gpu.mi250 timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -188,7 +226,11 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" @@ -202,7 +244,11 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -210,7 +256,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index ad5dd74972d0a..6ac18c7395fd7 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -22,6 +22,7 @@ name: !{{ build_environment }} echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" {%- endmacro %} +<<<<<<< HEAD {%- macro setup_python(py_ver) -%} - name: Setup Python uses: actions/setup-python@v6 @@ -31,6 +32,8 @@ name: !{{ build_environment }} freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} {%- endmacro %} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) on: # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 push: @@ -56,6 +59,12 @@ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SKIP_ALL_TESTS: 0 +<<<<<<< HEAD +======= +{%- if cross_compile_arm64 %} + CROSS_COMPILE_ARM64: 1 +{% endif %} +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ common.concurrency(build_environment) }} jobs: @@ -70,6 +79,7 @@ jobs: {%- endif %} steps: !{{ set_runner_specific_vars() }} +<<<<<<< HEAD !{{ setup_python(config.get("python_version", "3.10")) }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env @@ -77,6 +87,30 @@ jobs: "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -93,6 +127,11 @@ jobs: {%- if config["package_type"] == "wheel" %} - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -102,10 +141,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 5e3798f8e2377..ae519cc9a7330 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -15,7 +15,11 @@ # favor of GPU_ARCH_VERSION DESIRED_CUDA: !{{ config["desired_cuda"] }} {%- if config["gpu_arch_version"] %} +<<<<<<< HEAD GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}" +======= + GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }} +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }} {%- if include_skip_tests %} @@ -25,6 +29,14 @@ DOCKER_IMAGE: !{{ config["container_image"] }} DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} {%- endif %} +<<<<<<< HEAD +======= +{%- if config["package_type"] == "manywheel" %} + {%- if config.use_split_build is defined %} + use_split_build: !{{ config["use_split_build"] }} + {%- endif %} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config["package_type"] == "libtorch" %} {%- if config["libtorch_config"] %} LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} @@ -33,7 +45,11 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} {%- else %} diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 34c148270c6bc..a566a3dde764f 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -64,7 +64,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -79,9 +83,15 @@ jobs: runs-on: "windows-11-arm64-preview" {%- else %} {%- if branches == "nightly" %} +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral" +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + {%- else %} + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} {%- endif %} timeout-minutes: !{{ common.timeout_minutes_windows_binary }} @@ -135,7 +145,11 @@ jobs: {%- else %} !{{ set_runner_specific_vars() }} !{{ common.setup_ec2_windows() }} +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- endif %} - name: Populate binary env shell: bash @@ -211,7 +225,11 @@ jobs: "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" {%- else %} !{{ common.setup_ec2_windows() }} +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch") }} +======= + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ set_runner_specific_vars() }} {%- endif %} - uses: !{{ common.download_artifact_action }} diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 72241a772be61..59a23462af5d2 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,11 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -69,25 +73,41 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +117,11 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +233,9 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git 
a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index bfa035bc753b8..4bb6b0198d7d4 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -26,6 +26,16 @@ on: default: 240 type: number description: timeout for the job +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. + required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: required: false type: string @@ -110,6 +120,10 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -134,6 +148,10 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" +<<<<<<< HEAD +======= + echo "USE_SPLIT_BUILD=${{ env.use_split_build }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } >> "${GITHUB_ENV} }}" - name: List the env @@ -142,13 +160,21 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -178,7 +204,10 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -213,9 +242,15 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main with: # If doing this in main or release branch, use docker.io. Otherwise +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + # If doing this in release/2.8 or release branch, use docker.io. 
Otherwise +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # use ECR docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -227,7 +262,11 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -252,6 +291,10 @@ jobs: -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \ +<<<<<<< HEAD +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ @@ -283,7 +326,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 476dd182db0f8..5fdd16f8db0cb 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -64,6 +64,16 @@ on: required: true type: string description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. 
+ required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: required: true @@ -97,6 +107,10 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -121,18 +135,30 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" +<<<<<<< HEAD +======= + echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } >> "${GITHUB_ENV} }}" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -155,7 +181,10 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive show-progress: false path: pytorch @@ -186,7 +215,11 @@ jobs: path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials @@ -201,7 +234,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ 
startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -211,7 +248,11 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -223,7 +264,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 636b76d42931a..98f40e28fcf23 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -51,6 +51,16 @@ on: required: false type: string description: Desired python version +<<<<<<< HEAD +======= + use_split_build: + description: | + [Experimental] Build a libtorch only wheel and build pytorch such that + are built from the libtorch wheel. + required: false + type: boolean + default: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: required: true @@ -79,9 +89,16 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + USE_SPLIT_BUILD: ${{ inputs.use_split_build }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ebf96264e9944..7f2a81569d1b6 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,6 +67,7 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. 
So much memory just to generate cpp # doc +<<<<<<< HEAD runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved @@ -75,12 +76,30 @@ jobs: runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 +======= + runner: ${{ inputs.runner_prefix }}linux.12xlarge + # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) + # Let's try to figure out how this can be improved + timeout-minutes: 240 + - docs_type: python + runner: ${{ inputs.runner_prefix }}linux.2xlarge + # It takes less than 30m to finish python docs unless there are issues + timeout-minutes: 30 + - docs_type: functorch + runner: ${{ inputs.runner_prefix }}linux.2xlarge + # It takes less than 15m to finish functorch docs unless there are issues + timeout-minutes: 15 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) # The current name requires updating the database last docs push query from test-infra every time the matrix is updated name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -91,7 +110,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux @@ -106,12 +129,20 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -207,6 +238,21 @@ jobs: path: cppdocs/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs +<<<<<<< HEAD - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + - name: Upload functorch Docs Preview + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} + with: + retention-days: 14 + s3-bucket: 
doc-previews + if-no-files-found: error + path: functorch_ghpages/nightly/ + s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml index 014e6106b0730..9287aa304b2af 100644 --- a/.github/workflows/_link_check.yml +++ b/.github/workflows/_link_check.yml @@ -11,9 +11,14 @@ on: jobs: lint-urls: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: job-name: lint-urls +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout: 120 runner: ${{ inputs.runner }}linux.2xlarge docker-image: ci-image:pytorch-linux-jammy-linter @@ -37,9 +42,14 @@ jobs: lint-xrefs: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: job-name: lint-xrefs +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout: 60 runner: ${{ inputs.runner }}linux.2xlarge docker-image: ci-image:pytorch-linux-jammy-linter diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index cc0064391fdef..6790c0d482289 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -16,6 +16,14 @@ on: type: boolean default: true description: If set, upload generated build artifacts. +<<<<<<< HEAD +======= + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: required: false type: string @@ -37,7 +45,11 @@ on: runner: required: false type: string +<<<<<<< HEAD default: "linux.c7i.2xlarge" +======= + default: "linux.2xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: | Label of the runner this job should run on. 
test-matrix: @@ -64,6 +76,14 @@ on: required: false type: string default: "" +<<<<<<< HEAD +======= + max-jobs: + description: | + Overwrite the number of jobs to use for the build + required: false + type: string +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) disable-monitor: description: | Disable utilization monitoring for build job @@ -82,6 +102,10 @@ on: required: false type: number default: 1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allow-reuse-old-whl: description: | If set, the build try to pull an old wheel from s3 that was built on a @@ -89,6 +113,7 @@ on: required: false type: boolean default: true +<<<<<<< HEAD build-additional-packages: description: | If set, the build job will also builds these packages and saves their @@ -103,6 +128,8 @@ on: required: false type: string default: "" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: @@ -114,6 +141,10 @@ on: description: | FB app token to write to scribe endpoint +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) outputs: docker-image: value: ${{ jobs.build.outputs.docker-image }} @@ -128,12 +159,17 @@ jobs: # Don't run on forked repos if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }} +<<<<<<< HEAD timeout-minutes: 480 +======= + timeout-minutes: 240 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) outputs: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: @@ -141,13 +177,23 @@ jobs: instructions: | Build is done inside the container, to start an interactive session run: docker exec -it $(docker container ps --format '{{.ID}}') bash +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # [pytorch repo ref] # Use a pytorch/pytorch reference instead of a reference to the local # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -183,7 +229,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} @@ -199,7 +249,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -232,7 +286,11 @@ jobs: MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | mkdir -p ../../usage_logs +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor \ --log-interval "$MONITOR_LOG_INTERVAL" \ --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \ @@ -254,6 +312,11 @@ jobs: env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} BRANCH: ${{ steps.parse-ref.outputs.branch }} +<<<<<<< HEAD +======= + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs @@ -265,11 +328,19 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} +<<<<<<< HEAD OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }} RUNNER: ${{ inputs.runner }} +======= + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | START_TIME=$(date +%s) if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -289,12 +360,22 @@ jobs: DOCKER_SHELL_CMD= fi 
+<<<<<<< HEAD +======= + if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then + MAX_JOBS="$(nproc --ignore=2)" + else + MAX_JOBS="${MAX_JOBS_OVERRIDE}" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Leaving 1GB for the runner and other things TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap # comes from https://github.com/pytorch/test-infra/pull/6058 TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) +<<<<<<< HEAD if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then # EC2 specific setup for RISC-V emulation # Ensure binfmt_misc is available @@ -320,13 +401,22 @@ jobs: RISCV_DOCKER_ARGS= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty # shellcheck disable=SC2086 container_name=$(docker run \ +<<<<<<< HEAD ${RISCV_DOCKER_ARGS} \ -e BUILD_ENVIRONMENT \ -e MAX_JOBS="$(nproc --ignore=2)" \ +======= + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS=${MAX_JOBS} \ + -e MAX_JOBS_OVERRIDE \ + -e AWS_DEFAULT_REGION \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e PR_NUMBER \ -e SHA1 \ -e BRANCH \ @@ -340,8 +430,12 @@ jobs: -e OUR_GITHUB_JOB_ID \ -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ +<<<<<<< HEAD -e BUILD_ADDITIONAL_PACKAGES \ -e RUNNER \ +======= + -e USE_SPLIT_BUILD \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ @@ -355,16 +449,20 @@ jobs: "${USED_IMAGE}" \ ${DOCKER_SHELL_CMD} ) +<<<<<<< HEAD if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then docker exec -t "${container_name}" sh -c "python3 -m pip install -r requirements.txt" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh' END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Build external packages id: build-external-packages if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped' @@ -385,6 +483,8 @@ jobs: mv "$src" "dist/$(dirname "$src")/" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} shell: bash @@ -457,7 +557,11 @@ jobs: artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml 
b/.github/workflows/_linux-test.yml index 29c2fc8e08476..244f12494512d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -72,10 +72,13 @@ on: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub +<<<<<<< HEAD VLLM_TEST_HUGGING_FACE_TOKEN: required: false description: | HF Auth token to test vllm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SCRIBE_GRAPHQL_ACCESS_TOKEN: required: false description: | @@ -94,6 +97,7 @@ jobs: environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }} runs-on: ${{ matrix.runner }} timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} +<<<<<<< HEAD permissions: id-token: write contents: read @@ -101,6 +105,12 @@ jobs: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +======= + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -108,6 +118,7 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: no-sudo: true @@ -125,12 +136,25 @@ jobs: - name: configure aws credentials if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + no-sudo: true + + - name: Setup Linux + uses: ./.github/actions/setup-linux + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test aws-region: us-east-1 +<<<<<<< HEAD - name: Login to Amazon ECR if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }} id: login-ecr @@ -140,6 +164,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -155,7 +184,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - 
name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -167,20 +200,33 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main with: driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup GPU_FLAG for docker run id: setup-gpu-flag run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" +<<<<<<< HEAD if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }} +======= + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container id: setup-sscache-port-flag run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" +<<<<<<< HEAD if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }} +======= + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Lock NVIDIA A100 40GB Frequency run: | @@ -209,7 +255,11 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -224,6 +274,7 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts +<<<<<<< HEAD - name: Download Windows torch wheel for cross-compilation if: matrix.win_torch_wheel_artifact != '' uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # 
v4.2.0 @@ -264,6 +315,8 @@ jobs: echo "CUDA libraries:" ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -287,12 +340,15 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" +<<<<<<< HEAD - name: Preserve github env variables for use in docker shell: bash run: | env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test id: test timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} @@ -313,8 +369,11 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} +<<<<<<< HEAD EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -323,8 +382,13 @@ jobs: NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs +<<<<<<< HEAD SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }} SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }} +======= + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} @@ -332,9 +396,15 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} +<<<<<<< HEAD VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} +======= + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} run: | set -x @@ -360,6 +430,13 @@ jobs: # if for some reason cleanup action doesn't stop container # when job is cancelled DOCKER_SHELL_CMD="sleep 12h" +<<<<<<< HEAD +======= + + # since some steps are skipped on s390x, if they are necessary, run them here + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | 
grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else SHM_OPTS="--shm-size=${SHM_SIZE}" JENKINS_USER="--user jenkins" @@ -409,9 +486,15 @@ jobs: -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e HUGGING_FACE_HUB_TOKEN \ +<<<<<<< HEAD -e VLLM_TEST_HUGGING_FACE_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e DASHBOARD_TAG \ +======= + -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ + -e DASHBOARD_TAG \ + -e IS_A100_RUNNER \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e ARTIFACTS_FILE_SUFFIX \ --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ @@ -429,6 +512,11 @@ jobs: "${DOCKER_IMAGE}" \ ${DOCKER_SHELL_CMD} ) +<<<<<<< HEAD +======= + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -448,6 +536,7 @@ jobs: test_config: ${{ matrix.config }} job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} +<<<<<<< HEAD - name: Authenticate with AWS if: ${{ always() && contains(matrix.runner, 'b200') }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 @@ -459,6 +548,10 @@ jobs: - name: Upload the benchmark results uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + - name: Upload the benchmark results + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: benchmark-results-dir: test/test-reports @@ -516,7 +609,11 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 24fe510f0fb59..bdd40b4674a6d 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -67,11 +67,19 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: 
Set xcode version env: @@ -82,10 +90,17 @@ jobs: fi - name: Setup Python +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt +======= + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 @@ -123,7 +138,11 @@ jobs: else # The runner has access to the S3 bucket via IAM profile without the need # for any credential +<<<<<<< HEAD echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" +======= + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}" fi @@ -152,14 +171,27 @@ jobs: env: OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} run: | +<<<<<<< HEAD # TODO: Remove me later, and properly activate venv PATH="$VENV_PATH/bin:$PATH" export PATH +======= + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + + if [[ -n "$CONDA_ENV" ]]; then + # Use binaries under conda environment + export PATH="$CONDA_ENV/bin":$PATH + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: Same trick as Linux, there is no need to initialize sccache with the risk of getting # it hangs or timeout at initialization. 
The cache will be started automatically export SKIP_SCCACHE_INITIALIZATION=1 +<<<<<<< HEAD .ci/pytorch/macos-build.sh +======= + ${CONDA_RUN} .ci/pytorch/macos-build.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' @@ -188,4 +220,8 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 82eb3c4bf2c75..b065453b7f009 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -88,6 +88,7 @@ jobs: pkill "${PROCESS}" || true done +<<<<<<< HEAD - name: Clean up brew miniconda, if installed continue-on-error: true run: | @@ -95,6 +96,11 @@ jobs: brew uninstall miniconda echo "REINSTALL_BREW_MINICONDA=1" >> "${GITHUB_ENV}" fi +======= + - name: Clean up leftover miniconda installation + continue-on-error: true + run: brew uninstall miniconda || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean up leftover local python3 site-packages on MacOS pet runner continue-on-error: true @@ -105,11 +111,19 @@ jobs: done - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@main # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id @@ -118,12 +132,15 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD - name: Setup Python uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} pip-requirements-file: .ci/docker/requirements-ci.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} @@ -136,8 +153,13 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7 "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> 
"${GITHUB_OUTPUT}" - name: Download build artifacts @@ -152,6 +174,16 @@ jobs: with: use-gha: true +<<<<<<< HEAD +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + default-packages: "" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -202,7 +234,11 @@ jobs: set -ex # TODO: Remove me later, and properly activate venv +<<<<<<< HEAD PATH="$VENV_PATH/bin:$PATH" +======= + PATH="$(dirname "$(which python)"):$PATH" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export PATH # Print out some information about the test environment @@ -257,7 +293,11 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false @@ -276,6 +316,7 @@ jobs: workflow_attempt: ${{github.run_attempt}} local_path: usage_log.txt +<<<<<<< HEAD - name: Reinstall brew miniconda, if was installed if: always() continue-on-error: true @@ -288,3 +329,9 @@ jobs: if: always() continue-on-error: true uses: pytorch/test-infra/.github/actions/check-disk-space@main +======= + - name: Clean up disk space + if: always() + continue-on-error: true + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 43ed76a63cc67..35c814ae5afe0 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -62,11 +62,14 @@ on: required: false type: number default: 1 +<<<<<<< HEAD secrets: HUGGING_FACE_HUB_TOKEN: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -81,17 +84,27 @@ jobs: strategy: matrix: ${{ fromJSON(inputs.test-matrix) }} fail-fast: false +<<<<<<< HEAD runs-on: ${{ matrix.runner }} timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true - name: Setup ROCm uses: 
./.github/actions/setup-rocm +<<<<<<< HEAD - name: Runner check GPU count (distributed jobs) if: ${{ contains(matrix.config, 'distributed') }} shell: bash @@ -105,11 +118,33 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -122,9 +157,12 @@ jobs: - name: Start monitoring script id: monitor-script +<<<<<<< HEAD if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} @@ -132,8 +170,16 @@ jobs: WORKFLOW_RUN_ID: ${{github.run_id}} MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 +======= + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -169,12 +215,15 @@ jobs: run: | echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" +<<<<<<< HEAD - name: Preserve github env variables for use in docker shell: bash run: | env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test id: test env: @@ -190,22 +239,35 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} +<<<<<<< HEAD BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} +<<<<<<< HEAD +======= + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_IMAGE: ${{ inputs.docker-image }} PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} +<<<<<<< HEAD HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} run: | set -x @@ -235,7 +297,10 @@ jobs: -e GITHUB_RUN_ATTEMPT \ -e JOB_ID \ -e JOB_NAME \ +<<<<<<< HEAD -e BASE_SHA \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e BRANCH \ -e SHA1 \ -e AWS_DEFAULT_REGION \ @@ -253,12 +318,18 @@ jobs: -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e TESTS_TO_INCLUDE \ +<<<<<<< HEAD -e HUGGING_FACE_HUB_TOKEN \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DASHBOARD_TAG \ --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --ulimit core=0 \ +<<<<<<< HEAD --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --shm-size="8g" \ @@ -281,8 +352,13 @@ jobs: # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" +<<<<<<< HEAD - name: Change permissions (only needed for kubernetes runners for now) if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }} +======= + - name: Change permissions (only needed for MI300 runners for now) + if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" @@ -332,7 +408,11 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index 0d674f044ec42..2f3ed8ea446e5 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -59,7 +59,11 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch +<<<<<<< HEAD # uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # with: # fetch-depth: 1 # submodules: true diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 0fd3cf7f3972e..5d57b739685c9 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,15 +77,26 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true +<<<<<<< HEAD git config --global core.ignorecase false +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false +<<<<<<< HEAD - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -100,7 +111,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -148,7 +163,11 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} +<<<<<<< HEAD PYTHON_VERSION: "3.10" +======= + PYTHON_VERSION: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 @@ -168,6 +187,7 @@ jobs: run: | .ci/pytorch/win-build.sh +<<<<<<< HEAD # Collect Windows torch libs and CUDA libs for cross-compilation - name: Collect Windows CUDA libs for cross-compilation if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu' @@ -193,6 +213,8 @@ jobs: echo "Collected CUDA libs:" ls -lah /c/${{ github.run_id }}/build-results/*.lib +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' diff --git 
a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 3d2fe8a4b3fac..c1bbc5395659c 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,15 +70,26 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true +<<<<<<< HEAD git config --global core.ignorecase false +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false +<<<<<<< HEAD - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + - name: Clean up leftover processes on non-ephemeral Windows runner + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -94,7 +105,11 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -103,6 +118,21 @@ jobs: with: cuda-version: ${{ inputs.cuda-version }} +<<<<<<< HEAD +======= + # TODO: Move to a requirements.txt file for windows + - name: Install pip dependencies + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 + with: + shell: bash + timeout_minutes: 5 + max_attempts: 5 + retry_wait_seconds: 30 + command: | + set -eu + python3 -m pip install 'xdoctest>=1.1.0' + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -124,7 +154,11 @@ jobs: continue-on-error: true run: | # Windows conda doesn't have python3 binary, only python, but it's python3 +<<<<<<< HEAD ${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -169,7 +203,11 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 +<<<<<<< HEAD PYTHON_VERSION: "3.10" +======= + PYTHON_VERSION: 3.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} @@ -257,6 +295,18 @@ jobs: 
shell: bash run: python3 .github/scripts/parse_ref.py +<<<<<<< HEAD +======= + - name: Uninstall PyTorch + if: always() + continue-on-error: true + shell: bash + run: | + # This step removes PyTorch installed by the test to give a clean slate + # to the next job + python3 -mpip uninstall -y torch + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown Windows uses: ./.github/actions/teardown-win if: always() diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 7aa7608924487..f8dcf6d2a03e6 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -77,7 +77,11 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup XPU uses: ./.github/actions/setup-xpu @@ -95,7 +99,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} @@ -109,7 +117,11 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -133,7 +145,11 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" @@ -191,6 +207,12 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +<<<<<<< HEAD +======= + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} @@ -275,7 +297,11 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | +<<<<<<< HEAD docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" 
+======= + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 8318286cccbee..a4d5c04274052 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,10 +36,17 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: +<<<<<<< HEAD tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: almalinux-builder custom-tag-prefix: ${{matrix.tag}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index c67281e0a112b..70d7c80609f6b 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -32,7 +32,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -48,17 +52,29 @@ jobs: fail-fast: false matrix: include: [ +<<<<<<< HEAD { tag: "cuda13.0" }, { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, { tag: "rocm6.4" }, { tag: "rocm7.0" }, +======= + { tag: "cuda12.9" }, + { tag: "cuda12.8" }, + { tag: "cuda12.6" }, + { tag: "rocm6.3" }, + { tag: "rocm6.4" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { tag: "cpu" }, ] steps: - name: Build docker image +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: libtorch-cxx11-builder custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index be8f613169e8c..d96f1505826ce 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,7 +34,11 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD cuda_version: ["130", "129", "128", "126"] +======= + cuda_version: ["129", "128", "126"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-rocm-linux.yml 
b/.github/workflows/build-magma-rocm-linux.yml index eaeb741e56394..d7a6aadd7e380 100644 --- a/.github/workflows/build-magma-rocm-linux.yml +++ b/.github/workflows/build-magma-rocm-linux.yml @@ -34,7 +34,11 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD rocm_version: ["70", "64"] +======= + rocm_version: ["64", "63"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml index b7d293a5cec11..fc5ea76151cd5 100644 --- a/.github/workflows/build-magma-windows.yml +++ b/.github/workflows/build-magma-windows.yml @@ -22,7 +22,11 @@ jobs: runs-on: windows-2022 strategy: matrix: +<<<<<<< HEAD cuda_version: ["130", "129", "128", "126"] +======= + cuda_version: ["129", "128", "126"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) config: ["Release", "Debug"] env: CUDA_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml index c498e169f1aa5..b7fbb55df6fe4 100644 --- a/.github/workflows/build-manywheel-images-s390x.yml +++ b/.github/workflows/build-manywheel-images-s390x.yml @@ -25,7 +25,11 @@ jobs: runs-on: linux.s390x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false no-sudo: true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index a5c5c387adb82..f9ba2a402657f 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -32,7 +32,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -46,6 +50,7 @@ jobs: fail-fast: false matrix: include: [ +<<<<<<< HEAD { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, @@ -58,13 +63,29 @@ jobs: { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, +======= + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, + { name: 
"manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, ] runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} name: ${{ matrix.name }}:${{ matrix.tag }} steps: - name: Build docker image +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/binary-docker-build@main +======= + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ matrix.name }} custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 9e4144ae56c2d..f350ba334045d 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,12 +3,19 @@ name: Build Triton wheels on: push: branches: +<<<<<<< HEAD - main +======= + - release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ +<<<<<<< HEAD - 'ciflow/triton_binaries/*' +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) paths: - .github/workflows/build-triton-wheel.yml - .github/scripts/build_triton_wheel.py @@ -36,7 +43,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,12 +61,20 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] +======= + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" +<<<<<<< HEAD rocm_version: "7.0" +======= + rocm_version: "6.4" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs_on: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -74,12 +93,20 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -87,7 +114,11 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -108,6 +139,12 @@ jobs: # Determine python executable for given version case $PY_VERS in +<<<<<<< HEAD +======= + 3.9) + PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python + ;; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -123,12 +160,15 @@ jobs: 3.13t) PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python ;; +<<<<<<< HEAD 3.14) PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python ;; 3.14t) PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python ;; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *) echo "Unsupported python version ${PY_VERS}" exit 1 @@ -142,7 +182,11 @@ jobs: fi docker exec -t "${container_name}" yum install -y zlib-devel zip +<<<<<<< HEAD docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel +======= + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set +e docker exec -t "${container_name}" command -v pip has_pip=$? 
@@ -181,7 +225,11 @@ jobs: path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() build-wheel-win: @@ -191,7 +239,11 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] +======= + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device: ["xpu"] timeout-minutes: 40 env: @@ -214,7 +266,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index 44430522b79d8..679d8028c6e69 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,7 +38,11 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 569a174665ba8..c94545096896f 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -56,7 +56,11 @@ jobs: cache: pip architecture: x64 +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash - name: Verify mergeability diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml index 310857782ea14..3153a0fc07175 100644 --- a/.github/workflows/cherry-pick.yml +++ b/.github/workflows/cherry-pick.yml @@ -26,7 +26,11 @@ jobs: cache: pip # Not the direct dependencies but the script uses trymerge +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index bef3d8797149c..d20a1407f39a5 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index d5e0d96fe19f2..c7f79fd9f2be9 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +39,10 @@ jobs: contents: write outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} +<<<<<<< HEAD pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -54,6 +61,7 @@ jobs: tag_or_branch="${tag_or_branch#refs/heads/}" # replace directory separators with _ in branch name tag_or_branch="${tag_or_branch//\//_}" +<<<<<<< HEAD torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" { echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; @@ -84,19 +92,43 @@ jobs: pip install build==1.2.2.post1 || exit 1 python -m build --sdist || exit 1 cd dist || exit 1 +======= + echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" + echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" + - name: Checkout optional submodules + run: python3 tools/optional_submodules.py + - name: Create source distribution + run: | + # Create new folder with specified name so extracting the archive yields that + rm -rf "/tmp/$PT_RELEASE_NAME" + cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" + mv "/tmp/$PT_RELEASE_NAME" . 
+ # Cleanup + rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} + find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true + # Create archive + tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" + echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 with: +<<<<<<< HEAD files: | ${{ env.PT_RELEASE_FILE }} ${{ env.PT_PEP517_RELEASE_FILE }} - name: Upload source distribution to GHA artifacts # for release tags +======= + files: ${{env.PT_RELEASE_FILE}} + - name: Upload source distribution to GHA artifacts for release tags +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} +<<<<<<< HEAD - name: Upload PEP 517 source distribution to GHA artifacts # for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 @@ -110,6 +142,11 @@ jobs: echo "pt_release_name=${{ env.PT_RELEASE_FILE }}"; echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}"; } >> "${GITHUB_OUTPUT}" +======= + - name: Set output + id: release_name + run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -125,9 +162,12 @@ jobs: - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} +<<<<<<< HEAD - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_pep517_release_name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Configure AWS credentials(PyTorch account) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: @@ -138,9 +178,13 @@ jobs: s3-bucket: pytorch s3-prefix: source_code/test if-no-files-found: warn +<<<<<<< HEAD path: | ${{ needs.release.outputs.pt_release_name }} ${{ needs.release.outputs.pt_pep517_release_name }} +======= + path: ${{ needs.release.outputs.pt_release_name }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index ca257ee8225ad..bb533842d3717 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,11 @@ jobs: get-label-type: if: 
github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,6 +54,7 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, +<<<<<<< HEAD pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, @@ -73,6 +78,34 @@ jobs: pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 +======= + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, + pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.13-clang12, + pytorch-linux-jammy-rocm-n-1-py3, + pytorch-linux-jammy-rocm-n-py3, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, + pytorch-linux-jammy-py3.9-gcc11, + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-py3.12-halide, + pytorch-linux-jammy-xpu-2025.0-py3, + pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-jammy-py3-clang18-asan, + pytorch-linux-jammy-py3-clang12-onnx, + pytorch-linux-jammy-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, + pytorch-linux-jammy-py3-clang12-executorch, + pytorch-linux-jammy-py3.12-triton-cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 @@ -94,21 +127,33 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:${{ matrix.docker-image-name }} always-rebuild: true push: true - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} @@ -121,7 +166,11 @@ jobs: GHCR_PAT: ${{ secrets.GHCR_PAT }} with: shell: bash +<<<<<<< HEAD timeout_minutes: 60 +======= + timeout_minutes: 30 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) max_attempts: 5 retry_wait_seconds: 90 command: | @@ -139,5 +188,9 @@ jobs: if: always() - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml index 02c1171c567aa..2f5e2e5e60ca0 100644 --- a/.github/workflows/docker-cache-mi300.yml +++ b/.github/workflows/docker-cache-mi300.yml @@ -20,7 +20,11 @@ jobs: runs-on: rocm-docker steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -39,13 +43,21 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 push: false - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 2560ebf7912aa..becdca64080b0 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +56,11 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: true @@ -82,7 +90,11 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - 
name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] @@ -144,7 +156,11 @@ jobs: run: | make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image" - name: Push nightly tags +<<<<<<< HEAD if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd64' }} +======= + if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" CUDA_SUFFIX="-cu${CUDA_VERSION}" @@ -164,12 +180,22 @@ jobs: fi - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() validate: needs: build +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main with: channel: nightly +======= + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.8 + with: + channel: test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index b8a6403faffbd..de5fa61edafe6 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -41,12 +41,135 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_9-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + 
secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,9 +183,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -83,6 +213,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -106,12 +240,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -204,6 +343,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -214,6 +355,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD 
GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -225,6 +367,20 @@ jobs: build_name: manywheel-py3_10-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -240,16 +396,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -296,6 +461,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -309,9 +476,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -332,6 +506,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -355,12 +533,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -453,6 +636,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -463,6 +648,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -474,6 +660,20 @@ jobs: build_name: manywheel-py3_11-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -489,16 +689,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml 
+<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -545,6 +754,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -558,9 +769,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -581,6 +799,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -604,12 +826,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -702,6 +929,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -712,6 +941,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -723,6 +953,20 @@ jobs: build_name: manywheel-py3_12-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -738,16 +982,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -794,6 +1047,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -807,9 +1062,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -830,6 +1092,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -853,12 +1119,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -951,6 +1222,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -961,6 +1234,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -972,6 +1246,20 @@ jobs: build_name: manywheel-py3_13-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; 
platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -987,16 +1275,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1043,6 +1340,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1056,9 +1355,16 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD DESIRED_PYTHON: "3.13t" 
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.r7g.12xlarge.memory +======= + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1079,6 +1385,10 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1102,12 +1412,17 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1200,6 +1515,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cuda-aarch64-12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1210,6 +1527,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder @@ -1221,6 +1539,20 @@ jobs: build_name: manywheel-py3_13t-cuda-aarch64-12_9 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -1236,15 +1568,24 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda-aarch64-12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -1789,3 +2130,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 7f3277ef64a12..b9528cb738a83 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -122,7 +126,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -145,7 +153,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -154,7 +166,11 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -169,7 +185,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -190,7 +210,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -213,7 +237,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -222,7 +250,11 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ 
secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -237,7 +269,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -258,7 +294,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -281,7 +321,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -290,7 +334,11 @@ jobs: build_name: libtorch-cuda12_9-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading @@ -305,7 +353,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" +======= + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 @@ -316,7 +368,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-build: +======= + libtorch-rocm6_3-shared-with-deps-release-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -325,6 +381,7 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -410,6 +467,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - libtorch-rocm6_4-shared-with-deps-release-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_3-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_3-shared-with-deps-release-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -418,6 +493,7 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -429,18 +505,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-rocm6_4-shared-with-deps-release +======= + name: libtorch-rocm6_3-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -462,7 +555,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_3-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release 
+ LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_3-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_4-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_4-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_4-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -470,7 +680,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -491,7 
+705,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" +======= + GPU_ARCH_VERSION: 6.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -501,6 +719,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-rocm7_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -619,3 +838,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml new file mode 100644 index 0000000000000..1b231ca5ffb6f --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -0,0 +1,87 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-release + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - 
libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml new file mode 100644 index 0000000000000..9ad095ee68b5f --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -0,0 +1,275 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + manywheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_6-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we 
eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ 
steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 5fcf4e0bd176f..48d92b918689a 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -41,12 +41,629 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cpu + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + 
GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_6-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_6-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_8-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-rocm6_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a 
legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_3-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + 
manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: 
manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-xpu + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-xpu-build + - get-label-type + runs-on: linux.idc.xpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + permissions: + id-token: write + contents: read + steps: + - name: Setup XPU + uses: ./.github/actions/setup-xpu + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-xpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: 
${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown XPU + uses: ./.github/actions/teardown-xpu + manywheel-py3_9-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-xpu-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,6 +677,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cpu @@ -81,6 +702,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu build_environment: linux-binary-manywheel @@ -103,6 +728,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu secrets: @@ -119,15 +748,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; 
platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -142,15 +783,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -165,10 +818,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 secrets: @@ -185,15 +846,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -208,15 +881,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get 
rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -231,10 +916,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 secrets: @@ -251,15 +944,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-test: # Testing @@ -274,15 +979,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_9-upload: # Uploading @@ -297,17 +1014,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cuda13_0-build: +======= + manywheel-py3_10-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ 
-316,6 +1045,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -356,19 +1086,174 @@ jobs: permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda13_0-test + needs: manywheel-py3_10-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.10" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: +<<<<<<< HEAD + name: manywheel-py3_10-rocm6_4 +======= + name: manywheel-py3_10-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +<<<<<<< HEAD + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_10-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_3-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda13_0 + build_name: manywheel-py3_10-rocm6_3 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -383,13 +1268,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -407,15 +1292,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 
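The ROCm test jobs above export GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon into GITHUB_ENV before the test-pytorch-binary action runs. A rough sketch of how a flag string like that can be spliced into a docker run invocation; the image tag and the rocminfo smoke test below are hypothetical, and the real action's internals are not shown in this diff:

```python
# Sketch: splice a GPU_FLAG-style string of device options into `docker run`.
# Illustration of the flags set in the workflow above, not the CI's own code.
import shlex
import subprocess

gpu_flag = (
    "--device=/dev/mem --device=/dev/kfd --device=/dev/dri "
    "--group-add video --group-add daemon"
)
image = "manylinux2_28-builder:rocm6.4"  # hypothetical tag, for illustration only

cmd = ["docker", "run", "--rm", *shlex.split(gpu_flag), image, "rocminfo"]
print(" ".join(cmd))                 # inspect the assembled command
# subprocess.run(cmd, check=True)    # uncomment on a host with Docker and ROCm
```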
GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.10" - permissions: - id-token: write - contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -427,7 +1310,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -449,7 +1331,8 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -457,7 +1340,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -478,16 +1365,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -603,6 +1499,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -616,11 +1514,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -640,13 +1546,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -664,7 +1578,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -675,7 +1592,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -683,7 +1604,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -707,6 +1632,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu secrets: @@ -726,6 +1655,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cpu @@ -747,6 +1680,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu build_environment: linux-binary-manywheel @@ -769,6 +1706,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu secrets: @@ -785,15 +1726,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -808,15 +1761,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -831,10 +1796,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 secrets: @@ -851,15 +1824,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -874,15 +1859,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + 
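One recurring difference between the two sides of these conflicts is that HEAD quotes GPU_ARCH_VERSION ("12.8", "6.4") while the incoming side leaves it bare (12.8, 6.4). In plain YAML a bare numeric-looking scalar loads as a float, which can silently reshape a version string; that is a plausible reason for the quoting, though how the reusable workflows consume the value is not visible in this diff. A quick illustration with PyYAML (assumed available):

```python
# Sketch: quoted vs. bare version-like scalars in YAML. Bare scalars that look
# numeric are parsed as floats, so "12.10" and 12.10 are not the same value.
import yaml

print(yaml.safe_load('v: 12.8'))      # {'v': 12.8}    -> float
print(yaml.safe_load('v: "12.8"'))    # {'v': '12.8'}  -> string
print(yaml.safe_load('v: 12.10'))     # {'v': 12.1}    -> trailing zero lost
print(yaml.safe_load('v: "12.10"'))   # {'v': '12.10'} -> preserved
```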
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -897,16 +1894,95 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_11-cuda12_8-full-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_8-full-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_8-full-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda12_9-build: if: ${{ github.repository_owner == 
'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -917,15 +1993,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-test: # Testing @@ -940,15 +2028,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_9-upload: # Uploading @@ -963,17 +2063,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda13_0-build: +======= + manywheel-py3_11-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -982,6 +2094,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -1064,6 +2177,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_11-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1072,6 +2203,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -1082,18 +2214,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: 
"3.11" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_11-rocm6_4 +======= + name: manywheel-py3_11-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1115,7 +2264,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_11-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # 
TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_11-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1123,7 +2389,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1144,16 +2414,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1269,6 +2548,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1282,11 +2563,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: 
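The jobs above repeat the same build, test, and upload triplet for every Python/accelerator combination, and their names follow a regular pattern (manywheel-py3_11-rocm6_4, manywheel-py3_10-cuda12_8, manywheel-py3_12-cpu, ...). A hypothetical helper that reproduces the naming, purely to make the pattern explicit; the real workflow is presumably generated elsewhere and this function is not part of it:

```python
# Sketch: derive the build_name used by the build/test/upload job triplets above.
# Hypothetical helper for illustration; not part of the workflow or its generator.
def build_name(package: str, python: str, arch_type: str, arch_version: str = "") -> str:
    py_tag = "py" + python.replace(".", "_")
    if arch_type == "cpu":
        arch_tag = "cpu"
    else:
        arch_tag = arch_type + arch_version.replace(".", "_")
    return f"{package}-{py_tag}-{arch_tag}"


assert build_name("manywheel", "3.11", "rocm", "6.4") == "manywheel-py3_11-rocm6_4"
assert build_name("manywheel", "3.10", "cuda", "12.8") == "manywheel-py3_10-cuda12_8"
assert build_name("manywheel", "3.12", "cpu") == "manywheel-py3_12-cpu"
```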
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1306,13 +2595,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1330,7 +2627,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1341,7 +2641,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1349,7 +2653,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1373,6 +2681,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu secrets: @@ -1392,6 +2704,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cpu @@ -1413,6 +2729,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu build_environment: linux-binary-manywheel @@ -1435,6 +2755,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu secrets: @@ -1451,15 +2775,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | 
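The calculate-docker-image steps above pick the registry with the GitHub Actions idiom cond && a || b, which behaves like a ternary as long as the first branch is truthy: refs under refs/tags/ciflow/ read from the internal ECR registry, everything else falls back to docker.io. A Python rendering of the same decision, with a made-up ref in the example:

```python
# Sketch: Python equivalent of the docker-registry expression used above.
# The `cond && a || b` idiom acts as a ternary because both registry strings
# are non-empty (a falsy first branch would otherwise fall through to b).
ECR = "308535385114.dkr.ecr.us-east-1.amazonaws.com"


def docker_registry(ref: str) -> str:
    # ciflow tag pushes pull from ECR; everything else uses Docker Hub
    return ECR if ref.startswith("refs/tags/ciflow/") else "docker.io"


assert docker_registry("refs/tags/ciflow/example/12345") == ECR  # hypothetical ref
assert docker_registry("refs/heads/main") == "docker.io"
```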
nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1474,15 +2810,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -1497,10 +2845,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 secrets: @@ -1517,15 +2873,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -1540,15 +2908,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD 
GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -1563,10 +2943,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 secrets: @@ -1583,15 +2971,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-test: # Testing @@ -1606,15 +3006,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_9-upload: # Uploading @@ -1629,17 +3041,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda13_0-build: +======= + manywheel-py3_12-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1648,6 +3072,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is 
a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -1730,6 +3155,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_12-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1738,6 +3181,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -1748,18 +3192,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_12-rocm6_4 +======= + name: manywheel-py3_12-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1781,7 +3242,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_12-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: 
read + needs: manywheel-py3_12-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_12-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1789,7 +3367,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1810,16 +3392,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1935,6 +3526,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1948,11 +3541,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
manywheel-py3_12-xpu-test: # Testing @@ -1972,13 +3573,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1996,7 +3605,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2007,7 +3619,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2015,7 +3631,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2039,6 +3659,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu secrets: @@ -2058,6 +3682,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cpu @@ -2079,6 +3707,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu build_environment: linux-binary-manywheel @@ -2101,6 +3733,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu secrets: @@ -2117,15 +3753,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2140,15 +3788,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2163,10 +3823,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 secrets: @@ -2183,15 +3851,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2206,15 +3886,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2229,10 +3921,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 secrets: @@ -2249,15 +3949,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-test: # Testing @@ -2272,15 +3984,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_9-upload: # Uploading @@ -2295,78 +4019,246 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda13_0-build: +======= + manywheel-py3_13-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda13_0 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda13_0-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + build_environment: 
linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda13_0-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + needs: + - manywheel-py3_13-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda13_0 + build_name: manywheel-py3_13-rocm6_3 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda13_0-test: # Testing + manywheel-py3_13-rocm6_3-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - 
manywheel-py3_13-cuda13_0-build + - manywheel-py3_13-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda +<<<<<<< HEAD + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda13_0 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda13_0-upload: # Uploading + permissions: + id-token: write + contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: +<<<<<<< HEAD + name: manywheel-py3_13-rocm6_4 +======= + name: manywheel-py3_13-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +<<<<<<< HEAD + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + 
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13-rocm6_3-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda13_0-test + needs: manywheel-py3_13-rocm6_3-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu130 - GPU_ARCH_VERSION: "13.0" - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda13_0 + build_name: manywheel-py3_13-rocm6_3 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2381,13 +4273,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2405,15 +4297,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False DESIRED_PYTHON: "3.13" - permissions: - id-token: write - contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm @@ -2425,7 +4315,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2447,7 +4336,8 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2455,7 +4345,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2476,16 +4370,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid 
of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2601,6 +4504,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2614,11 +4519,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -2638,13 +4551,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: 
"3.13" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2662,7 +4583,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2673,7 +4597,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2681,7 +4609,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2705,6 +4637,10 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu secrets: @@ -2724,6 +4660,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cpu @@ -2745,6 +4685,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu build_environment: linux-binary-manywheel @@ -2767,6 +4711,10 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu secrets: @@ -2783,15 +4731,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2806,15 +4766,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -2829,10 +4801,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +======= + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 secrets: @@ -2849,15 +4829,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2872,15 +4864,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -2895,10 +4899,18 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +======= + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 secrets: @@ -2915,15 +4927,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel 
+<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-test: # Testing @@ -2938,15 +4962,27 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_9-upload: # Uploading @@ -2961,17 +4997,29 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu129 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +======= + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda13_0-build: +======= + manywheel-py3_13t-rocm6_3-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2980,6 +5028,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda @@ -3062,6 +5111,24 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: - manywheel-py3_13t-rocm6_4-build +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_3 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_3-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3070,6 +5137,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.4 GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm @@ -3080,18 +5148,35 @@ jobs: permissions: id-token: write contents: read +======= + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: manywheel-py3_13t-rocm6_4 +======= + name: manywheel-py3_13t-rocm6_3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3113,7 +5198,124 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13t-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13t-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git 
clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3121,7 +5323,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3142,16 +5348,25 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 +<<<<<<< HEAD GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 +======= + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3267,6 +5482,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3280,11 +5497,19 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and 
platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3304,13 +5529,21 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" permissions: id-token: write contents: read steps: - name: Setup XPU +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/setup-xpu@main +======= + uses: ./.github/actions/setup-xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3328,7 +5561,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3339,7 +5575,11 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3347,7 +5587,11 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3371,11 +5615,16 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: 
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -4708,3 +6957,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 4a7ebe8366336..2fcb18482eb99 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -41,12 +41,86 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + manywheel-py3_9-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_9-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -60,6 +134,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -83,6 +161,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -105,6 +187,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x secrets: @@ -124,6 +210,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -147,6 +237,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -169,6 +263,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x secrets: @@ -188,6 +286,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -211,6 +313,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + 
use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -233,6 +339,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x secrets: @@ -252,6 +362,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -275,6 +389,10 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel @@ -297,11 +415,16 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +<<<<<<< HEAD +======= + use_split_build: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} @@ -494,3 +617,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index 109e98cd9d91f..86ff96b7a2e0c 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -60,6 +64,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -70,6 +75,23 @@ jobs: uses: actions/checkout@v4 with: 
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -80,9 +102,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index afe9330deb83d..6ac8df02540f4 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -30,6 +30,132 @@ concurrency: cancel-in-progress: true jobs: +<<<<<<< HEAD +======= + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - name: Test PyTorch wheel + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + + # Create new "clean" conda environment for testing + + SMOKE_TEST_PARAMS="" + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env + pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v + + # shellcheck disable=SC2086 + python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS} + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cpu-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: 
manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cpu + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge @@ -56,6 +182,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -66,6 +193,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -76,9 +220,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -94,6 +248,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -103,10 +262,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -165,6 +334,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -175,6 +345,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -185,9 +372,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -203,6 +400,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -212,10 +414,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -274,6 +486,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" 
# shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -284,6 +497,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -294,9 +524,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -312,6 +552,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -321,10 +566,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -383,6 +638,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -393,6 +649,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -403,9 +676,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -421,6 +704,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -430,10 +718,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -492,6 +790,7 @@ jobs: echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" # shellcheck disable=SC2129 echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Setup Python uses: actions/setup-python@v6 with: @@ -502,6 +801,23 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o 
"${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -512,9 +828,19 @@ jobs: working-directory: pytorch - name: Populate binary env run: | +<<<<<<< HEAD + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -530,6 +856,11 @@ jobs: "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" - name: Test PyTorch wheel run: | +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set -eux -o pipefail # shellcheck disable=SC1090 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" @@ -539,10 +870,20 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" +<<<<<<< HEAD # shellcheck disable=SC2086 python -mvenv test_venv source test_venv/bin/activate +======= + if [[ $DESIRED_PYTHON == "3.13t" ]]; then + conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge + SMOKE_TEST_PARAMS="--torch-compile-check disabled" + else + conda create -yn "test_conda_env" python="$DESIRED_PYTHON" + fi + conda activate test_conda_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v # shellcheck disable=SC2086 @@ -575,6 +916,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge @@ -793,3 +1135,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 7c26dbc3b9eea..3f5e0579ea009 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -41,7 +41,11 @@ jobs: 
get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +72,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -128,7 +140,11 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +157,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -201,7 +221,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 5e30b66183840..b2e5c084a0213 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} 
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +72,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -128,7 +140,11 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +157,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Populate binary env shell: cmd @@ -201,7 +221,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1368bc942350e..8f5c089c1d67c 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -41,7 +41,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +55,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: 
wheel @@ -124,7 +132,11 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +210,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +287,11 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +365,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +442,11 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml new file mode 100644 index 0000000000000..33d97946c6dc8 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -0,0 +1,259 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + branches: + - main + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 + OS: windows +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + 
libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 3ca3364e5de88..131e97fa2b158 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -35,7 +35,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -44,8 +48,13 @@ 
jobs: libtorch-cpu-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +67,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +97,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +133,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -160,7 +180,11 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +197,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -190,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +254,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -283,7 +318,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -291,22 +330,35 @@ jobs: libtorch-cuda12_6-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +384,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +420,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -408,21 +467,33 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -439,7 +510,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +546,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -527,13 +605,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -541,22 +627,35 @@ jobs: libtorch-cuda12_8-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +681,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +717,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -658,21 +764,33 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -689,7 +807,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +843,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -777,36 +902,61 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +982,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +1018,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -884,7 +1041,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-debug +======= + name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,6 +1063,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -909,20 +1071,38 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -939,7 +1119,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +1155,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -992,7 +1179,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-debug +======= + name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ 
-1015,26 +1206,44 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading +======= + libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: libtorch-cuda13_0-shared-with-deps-debug-test +======= + needs: libtorch-cuda12_9-shared-with-deps-debug-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" build_name: libtorch-cuda13_0-shared-with-deps-debug +======= + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml new file mode 100644 index 0000000000000..de71e497e7328 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -0,0 +1,259 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + branches: + - main + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 + OS: windows +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index c6d1e2cf3b017..1c93a77015cab 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -35,7 +35,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -44,8 
+48,13 @@ jobs: libtorch-cpu-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +67,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +97,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +133,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -160,7 +180,11 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +197,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -190,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +254,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< 
HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -283,7 +318,11 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -291,22 +330,35 @@ jobs: libtorch-cuda12_6-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +384,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +420,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -408,21 +467,33 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -439,7 +510,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +546,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -527,13 +605,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -541,22 +627,35 @@ jobs: libtorch-cuda12_8-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +681,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +717,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -658,21 +764,33 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -689,7 +807,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +843,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -777,36 +902,61 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +982,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +1018,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -884,7 +1041,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-release +======= + name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,6 +1063,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -909,20 +1071,38 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" +======= + DESIRED_PYTHON: "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Display EC2 information shell: bash @@ -939,7 +1119,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +1155,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -992,7 +1179,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: libtorch-cuda13_0-shared-with-deps-release +======= + name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1206,44 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading +======= + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: libtorch-cuda13_0-shared-with-deps-release-test +======= + needs: libtorch-cuda12_9-shared-with-deps-release-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason +<<<<<<< HEAD DESIRED_PYTHON: "3.10" build_name: libtorch-cuda13_0-shared-with-deps-release +======= + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-release +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index e14cb79c0000e..f085c60d21aeb 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -35,17 +35,1204 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are 
put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to 
get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-cuda12_9 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_9 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_9-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get 
rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_9-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_9-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_9-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even if the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_9-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE:
wheel @@ -80,7 +1267,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -112,7 +1303,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -156,7 +1350,11 @@ jobs: - wheel-py3_10-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -182,7 +1380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +1416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -279,15 +1484,24 @@ jobs: wheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -316,7 +1530,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +1566,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -392,14 +1613,22 @@ jobs: - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -419,7 +1648,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -451,7 +1684,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -507,7 +1743,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_6 @@ -517,15 +1757,24 @@ jobs: wheel-py3_10-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -554,7 +1803,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -586,7 +1839,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -630,14 +1886,22 @@ jobs: - wheel-py3_10-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -657,7 +1921,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -689,7 +1957,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -745,25 +2016,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_10-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: 
"${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -792,7 +2080,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -824,7 +2116,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -844,7 +2139,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_10-cuda13_0 +======= + name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,6 +2161,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -869,13 +2169,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -895,7 +2209,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -927,7 +2245,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -948,7 +2269,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_10-cuda13_0 +======= + name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,30 +2296,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_10-cuda13_0-upload: # Uploading +======= + wheel-py3_10-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_10-cuda13_0-test +======= + needs: wheel-py3_10-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1004,7 +2350,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | 
umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1030,7 +2380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1062,7 +2416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1106,7 +2463,11 @@ jobs: - wheel-py3_10-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1132,7 +2493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1164,7 +2529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1229,8 +2597,13 @@ jobs: wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 
+======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1265,7 +2638,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1297,7 +2674,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1341,7 +2721,11 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1367,7 +2751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1399,7 +2787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1464,15 +2855,24 @@ jobs: wheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1501,7 +2901,11 @@ jobs: echo "instance-type: $(get_ec2_metadata 
instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +2937,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1577,14 +2984,22 @@ jobs: - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1604,7 +3019,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1636,7 +3055,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1692,7 +3114,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_6 @@ -1702,15 +3128,24 @@ jobs: wheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This 
is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1739,7 +3174,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1771,7 +3210,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1815,14 +3257,22 @@ jobs: - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1842,7 +3292,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1874,7 +3328,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -1930,25 +3387,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD 
wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1977,7 +3451,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2009,7 +3487,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2029,7 +3510,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_11-cuda13_0 +======= + name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,6 +3532,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -2054,13 +3540,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2080,7 +3580,11 @@ jobs: echo 
"instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2112,7 +3616,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2133,7 +3640,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_11-cuda13_0 +======= + name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,30 +3667,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_11-cuda13_0-upload: # Uploading +======= + wheel-py3_11-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_11-cuda13_0-test +======= + needs: wheel-py3_11-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2189,7 +3721,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2215,7 +3751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2247,7 +3787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2291,7 +3834,11 @@ jobs: - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2317,7 +3864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2349,7 +3900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2414,8 +3968,13 @@ jobs: wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2450,7 +4009,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2482,7 +4045,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2526,7 +4092,11 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2552,7 +4122,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2584,7 +4158,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2649,15 +4226,24 @@ jobs: wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 
+<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2686,7 +4272,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2718,7 +4308,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2762,14 +4355,22 @@ jobs: - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2789,7 +4390,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2821,7 +4426,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -2877,7 +4485,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_6 @@ -2887,15 +4499,24 @@ jobs: wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" 
timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -2924,7 +4545,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2956,7 +4581,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3000,14 +4628,22 @@ jobs: - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3027,7 +4663,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3059,7 +4699,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3115,25 +4758,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + 
GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_12-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3162,7 +4822,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3194,7 +4858,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3214,7 +4881,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_12-cuda13_0 +======= + name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,6 +4903,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -3239,13 +4911,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_12-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< 
HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3265,7 +4951,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3297,7 +4987,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3318,7 +5011,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_12-cuda13_0 +======= + name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,30 +5038,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_12-cuda13_0-upload: # Uploading +======= + wheel-py3_12-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_12-cuda13_0-test +======= + needs: wheel-py3_12-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3374,7 +5092,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" +<<<<<<< HEAD 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3400,7 +5122,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3432,7 +5158,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3476,7 +5205,11 @@ jobs: - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3502,7 +5235,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: 
true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3534,7 +5271,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3599,8 +5339,13 @@ jobs: wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3635,7 +5380,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3667,7 +5416,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3711,7 +5463,11 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3737,7 +5493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3769,7 +5529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3834,15 +5597,24 @@ jobs: wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + 
timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -3871,7 +5643,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3903,7 +5679,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -3947,14 +5726,22 @@ jobs: - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -3974,7 +5761,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4006,7 +5797,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4062,7 +5856,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_6 @@ -4072,15 +5870,24 @@ jobs: wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4109,7 +5916,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4141,7 +5952,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4185,14 +5999,22 @@ jobs: - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4212,7 +6034,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4244,7 +6070,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4300,25 +6129,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_13-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4347,7 +6193,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4379,7 +6229,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4399,7 +6252,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_13-cuda13_0 +======= + name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,6 +6274,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -4424,13 +6282,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4450,7 +6322,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4482,7 +6358,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4503,7 +6382,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_13-cuda13_0 +======= + name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,30 +6409,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13-cuda13_0-upload: # Uploading +======= + wheel-py3_13-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_13-cuda13_0-test +======= + needs: wheel-py3_13-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4559,7 +6463,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4585,7 +6493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4617,7 +6529,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4661,7 +6576,11 @@ jobs: - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4687,7 +6606,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB 
EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4719,7 +6642,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4784,8 +6710,13 @@ jobs: wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4820,7 +6751,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4852,7 +6787,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -4896,7 +6834,11 @@ jobs: - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4922,7 +6864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4954,7 +6900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch 
show-progress: false @@ -5019,15 +6968,24 @@ jobs: wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5056,7 +7014,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5088,7 +7050,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5132,14 +7097,22 @@ jobs: - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5159,7 +7132,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5191,7 +7168,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: 
recursive path: pytorch show-progress: false @@ -5247,7 +7227,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.6" +======= + GPU_ARCH_VERSION: 12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_6 @@ -5257,15 +7241,24 @@ jobs: wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5294,7 +7287,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5326,7 +7323,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5370,14 +7370,22 @@ jobs: - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" +<<<<<<< HEAD timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5397,7 +7405,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5429,7 +7441,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5485,25 +7500,42 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 +<<<<<<< HEAD GPU_ARCH_VERSION: "12.8" +======= + GPU_ARCH_VERSION: 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + wheel-py3_13t-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5532,7 +7564,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5564,7 +7600,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5584,7 +7623,11 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: +<<<<<<< HEAD name: wheel-py3_13t-cuda13_0 +======= + name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,6 +7645,7 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -5609,13 
+7653,27 @@ jobs: - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 360 +======= + wheel-py3_13t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13t-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5635,7 +7693,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5667,7 +7729,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5688,7 +7753,11 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: +<<<<<<< HEAD name: wheel-py3_13t-cuda13_0 +======= + name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,30 +7780,51 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD wheel-py3_13t-cuda13_0-upload: # Uploading +======= + wheel-py3_13t-cuda12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: wheel-py3_13t-cuda13_0-test +======= + needs: wheel-py3_13t-cuda12_9-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu130 GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cuda13_0 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml wheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" timeout-minutes: 360 +======= + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5744,7 +7834,11 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5770,7 +7864,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5802,7 +7900,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5846,7 +7947,11 @@ jobs: - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" +<<<<<<< HEAD 
timeout-minutes: 360 +======= + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5872,7 +7977,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@main +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5904,7 +8013,10 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: +<<<<<<< HEAD ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive path: pytorch show-progress: false @@ -5966,6 +8078,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -8336,3 +10449,5 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index be19b8f961f4d..1c582731776f0 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -8,23 +8,33 @@ on: push: tags: - ciflow/h100-distributed/* +<<<<<<< HEAD schedule: - cron: 46 8 * * * # about 1:46am PDT +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -37,7 +47,11 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runner: "linux.c7i.12xlarge" +======= + runner: "linux.12xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: 
'9.0' diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index c6cc075e6b270..ce2b5f9bdec16 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -13,6 +13,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read @@ -25,6 +26,18 @@ jobs: with: build-environment: linux-jammy-py3.9-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-inductor-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use metal host for benchmark jobs test-matrix: | { include: [ @@ -32,6 +45,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-micro-benchmark-test: name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml @@ -40,5 +54,15 @@ jobs: build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index a0ae234ab5669..9d23a6d7eb128 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -13,14 +13,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository 
== 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index 78602e05586b7..8ff8ffd482201 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -16,14 +16,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -32,6 +40,7 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD nightly-dynamo-benchmarks-build: name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml @@ -39,6 +48,15 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: + name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -48,6 +66,7 @@ jobs: { config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -59,5 +78,17 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: + name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: 
inherit diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 628f624240127..51d22211b3124 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -10,15 +10,23 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,7 +51,10 @@ jobs: { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit test: diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130d..47daf1a70ad92 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -48,14 +48,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -116,7 +124,10 @@ jobs: { config: "inductor_torchbench_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml 
b/.github/workflows/inductor-perf-test-nightly-h100.yml index 8209bf053a772..a9100f4862a97 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,11 @@ name: inductor-perf-nightly-h100 on: schedule: +<<<<<<< HEAD - cron: 15 0 * * 1-6 +======= + - cron: 15 0,4,8,12,16,20 * * 1-6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -58,6 +62,7 @@ on: required: false type: string default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100 +<<<<<<< HEAD pull_request: # Changing these files guarantees that this workflow needs to be run paths: @@ -71,11 +76,23 @@ concurrency: permissions: id-token: write contents: read +======= + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -84,17 +101,26 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD build: name: build +======= + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD # Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100 # or newer GPUs, so it doesn't benefit much from existing compiler cache # from trunk. 
Also use a memory-intensive runner here because memory is # usually the bottleneck runner: linux.12xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '9.0' @@ -123,6 +149,7 @@ jobs: { config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -131,6 +158,15 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0 * * 1-6' +======= + secrets: inherit + + test-periodically: + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true @@ -144,7 +180,11 @@ jobs: secrets: inherit test-weekly: +<<<<<<< HEAD name: test-weekly +======= + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -161,6 +201,7 @@ jobs: secrets: inherit test: +<<<<<<< HEAD name: test uses: ./.github/workflows/_linux-test.yml needs: build @@ -170,6 +211,15 @@ jobs: with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} +======= + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 81c1c27b76439..3482da7e80639 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml 
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,9 +48,12 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +<<<<<<< HEAD { config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -63,7 +66,10 @@ jobs: # Same as the build job python-version: 3.12.7 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} +<<<<<<< HEAD timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) disable-monitor: false monitor-log-interval: 15 monitor-data-collect-interval: 4 diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml new file mode 100644 index 0000000000000..25da0dae163d3 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -0,0 +1,123 @@ +name: inductor-perf-nightly-rocm + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-rocm/* + schedule: + - cron: 0 7 * * 0 + # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it + # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs + workflow_dispatch: + inputs: + training: + description: Run training (on by default)? + required: false + type: boolean + default: true + inference: + description: Run inference (on by default)? + required: false + type: boolean + default: true + default: + description: Run inductor_default? + required: false + type: boolean + default: false + dynamic: + description: Run inductor_dynamic_shapes? + required: false + type: boolean + default: false + cppwrapper: + description: Run inductor_cpp_wrapper? + required: false + type: boolean + default: false + cudagraphs: + description: Run inductor_cudagraphs? + required: false + type: boolean + default: true + freezing_cudagraphs: + description: Run inductor_cudagraphs with freezing for inference? + required: false + type: boolean + default: false + aotinductor: + description: Run aot_inductor for inference? + required: false + type: boolean + default: false + maxautotune: + description: Run inductor_max_autotune? 
+ required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-benchmark-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-inductor-benchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 + secrets: inherit diff --git 
a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index a7110b0fd9328..ad3514c117daa 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -43,29 +43,44 @@ on: required: false type: boolean default: false +<<<<<<< HEAD freezing: description: Run freezing? required: false type: boolean default: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmark_configs: description: The list of configs used the benchmark required: false type: string +<<<<<<< HEAD default: inductor_huggingface_perf_cpu_x86_zen,inductor_timm_perf_cpu_x86_zen,inductor_torchbench_perf_cpu_x86_zen +======= + default: inductor_huggingface_perf_zen_cpu_x86,inductor_timm_perf_zen_cpu_x86,inductor_torchbench_perf_zen_cpu_x86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,12 +89,18 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | @@ -96,10 +117,29 @@ jobs: { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, +======= + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 3, num_shards: 
3, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD inductor-test-nightly: name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml @@ -110,6 +150,18 @@ jobs: dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -117,6 +169,7 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -126,6 +179,19 @@ jobs: dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + 
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 0533184df2e0e..5ce608d6582be 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -1,9 +1,12 @@ name: inductor-perf-nightly-x86 on: +<<<<<<< HEAD pull_request: paths: - .github/workflows/inductor-perf-test-nightly-x86.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) schedule: # - cron: 0 7 * * 1-6 # - cron: 0 7 * * 0 @@ -43,11 +46,14 @@ on: required: false type: boolean default: false +<<<<<<< HEAD freezing: description: Run freezing? required: false type: boolean default: true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmark_configs: description: The list of configs used the benchmark required: false @@ -55,17 +61,28 @@ on: default: inductor_huggingface_perf_cpu_x86,inductor_timm_perf_cpu_x86,inductor_torchbench_perf_cpu_x86 concurrency: +<<<<<<< HEAD group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true permissions: id-token: write contents: read +======= + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,14 +91,24 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: 
ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -98,6 +125,7 @@ jobs: { config: "inductor_torchbench_perf_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xl.spr-metal" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -111,6 +139,21 @@ jobs: dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + secrets: inherit + + + linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,6 +161,7 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -128,6 +172,19 @@ jobs: dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} docker-image: ${{ needs.inductor-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} +======= + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 19f72ba453414..9bc518ede20e8 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -63,14 +63,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -79,14 +87,21 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD +======= + # NB: Keep this in sync with trunk.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD # Every bit to make perf run faster helps runner: linux.12xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' @@ -113,7 +128,10 @@ jobs: { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, ]} selected-test-configs: ${{ inputs.benchmark_configs }} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit test-nightly: diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index b08d9865d15d3..13949b3a3f651 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -15,14 +15,22 @@ concurrency: cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -31,15 +39,24 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD periodic-dynamo-benchmarks-build: name: periodic-dynamo-benchmarks-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +<<<<<<< HEAD cuda-arch-list: '8.0;8.6' +======= + cuda-arch-list: '8.6' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -57,6 +74,7 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, +<<<<<<< HEAD { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -133,6 +151,64 @@ jobs: inductor-smoke-build: name: inductor-smoke-build +======= + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, 
runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -145,6 +221,7 @@ jobs: { include: [ { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -165,6 +242,27 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: + name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -179,6 +277,69 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, +<<<<<<< HEAD +======= + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: + name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + test-matrix: | + { include: [ + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build + test-matrix: | + { include: [ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -201,6 +362,7 @@ jobs: { config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_aot_inductor_amp_freezing_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -212,4 +374,16 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ 
needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3e..2a13d38059c3f 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -47,8 +51,13 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b1bb7972d67de..64b3e12cfca09 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -7,6 +7,10 @@ on: - release/* tags: - ciflow/inductor-rocm/* +<<<<<<< HEAD +======= + - ciflow/inductor/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: concurrency: @@ -20,7 +24,11 @@ permissions: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index 6ab276a57fc4d..618b97c774973 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -12,14 +12,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-unittest cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,8 +36,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,6 +60,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-test: name: inductor-test uses: ./.github/workflows/_linux-test.yml @@ -59,6 +73,46 @@ jobs: inductor-halide-build: name: inductor-halide-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_12-inductor-halide-build: + name: linux-jammy-cpu-py3.12-gcc11-inductor-halide +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -71,6 +125,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD 
inductor-halide-test: name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml @@ -83,6 +138,20 @@ jobs: inductor-triton-cpu-build: name: inductor-triton-cpu-build +======= + linux-jammy-cpu-py3_12-inductor-halide-test: + name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_12-inductor-halide-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_12-inductor-triton-cpu-build: + name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -95,6 +164,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-triton-cpu-test: name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml @@ -112,6 +182,25 @@ jobs: with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + with: + build-environment: linux-jammy-py3.12-gcc11 + docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -122,6 +211,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD inductor-cpu-test: name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml @@ -130,4 +220,39 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} +======= + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: 
"${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 2616141c0dc2a..80b1354cc6988 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,9 +22,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: unit-test: @@ -35,7 +39,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -44,8 +52,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf +<<<<<<< HEAD inductor-build: name: inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,6 +66,10 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD +======= + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -61,6 +78,7 @@ jobs: { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio fbgemm torchao" secrets: inherit @@ -82,6 +100,29 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build 
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +======= + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -93,6 +134,7 @@ jobs: { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, ]} +<<<<<<< HEAD build-additional-packages: "vision audio torchao" secrets: inherit @@ -104,4 +146,16 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-build docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} +======= + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index b962970dc5b78..e1eb6a7ee1a6e 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -13,7 +13,11 @@ jobs: if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} steps: - name: Checkout pytorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index e0de9ede35084..d2cfc48fe3a37 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint 
Action +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/bc-lint@main +======= + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 729b111574851..f46d53d2100a9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,10 @@ on: - landchecks/* tags: - ciflow/pull/* +<<<<<<< HEAD - ciflow/trunk/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: permissions: read-all @@ -22,12 +25,17 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} +<<<<<<< HEAD get-changed-files: if: github.repository_owner == 'pytorch' name: Get changed files @@ -59,12 +67,22 @@ jobs: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter +======= + lintrunner-clang: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + needs: get-label-type + with: + timeout: 120 + runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" if [ "$CHANGED_FILES" = "*" ]; then export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files" @@ -109,12 +127,26 @@ jobs: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" docker-image: ci-image:pytorch-linux-jammy-linter +======= + export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files" + export CLANG=1 + .github/scripts/lintrunner.sh + + lintrunner-noclang: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + needs: get-label-type + with: + timeout: 120 + runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 submodules: 
true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then @@ -125,6 +157,13 @@ jobs: quick-checks: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT --all-files" + .github/scripts/lintrunner.sh + + quick-checks: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -164,7 +203,11 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: -1 @@ -177,7 +220,11 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -188,6 +235,10 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | # Regenerate workflows +<<<<<<< HEAD +======= + export RELEASE_VERSION_TAG=2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .github/scripts/generate_ci_workflows.py RC=0 @@ -211,7 +262,11 @@ jobs: exit $RC toc: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -247,7 +302,11 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 @@ -267,6 +326,7 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: submodules: false @@ -275,6 +335,16 @@ jobs: uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: '3.10' +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + submodules: false + fetch-depth: 1 + - name: Setup Python 3.9 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.9' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) architecture: x64 cache: pip - name: Install dependencies @@ -304,7 +374,11 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 @@ -324,7 +398,10 @@ jobs: check-latest: false cache: pip cache-dependency-path: | +<<<<<<< HEAD **/requirements-build.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) **/requirements.txt - name: Setup Min Python version if: matrix.test_type != 'older_python_version' @@ -335,7 +412,10 @@ jobs: check-latest: false cache: pip cache-dependency-path: | +<<<<<<< HEAD **/requirements-build.txt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) **/requirements.txt - name: Install torch if: matrix.test_type == 'with_torch' diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c21..456f931527369 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 565a9b25df50f..512c60abcc99f 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -116,5 +120,9 @@ jobs: AWS_REGION: "" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index c80599fe89988..87477dda1e2e3 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -28,6 +28,10 @@ jobs: # than our AWS macos-m1-14 runners test-matrix: | { include: [ +<<<<<<< HEAD +======= + { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, ]} diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index acf3504dec9ca..d9e462b337bde 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,7 +23,11 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 696c5b68b475b..2299dc8c63342 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -19,7 +19,11 @@ concurrency: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -42,8 +46,13 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit docs-push: @@ -54,7 +63,11 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true @@ -75,24 +88,38 @@ jobs: repo-owner: pytorch branch: main pin-folder: .github/ci_commit_pins +<<<<<<< HEAD # executorch jobs are disabled since it needs some manual work for the hash update # - repo-name: executorch # repo-owner: pytorch # branch: main # pin-folder: .ci/docker/ci_commit_pins +======= + - repo-name: executorch + repo-owner: pytorch + branch: main + pin-folder: .ci/docker/ci_commit_pins +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - repo-name: triton repo-owner: triton-lang branch: main pin-folder: .ci/docker/ci_commit_pins +<<<<<<< HEAD - repo-name: vllm repo-owner: vllm-project branch: main pin-folder: .github/ci_commit_pins +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # 
Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@main +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 40bd245ce913f..a996318343a9b 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,11 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 40fb3b8d0c85f..fc88b2ce5ea19 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -7,24 +7,34 @@ on: workflow_dispatch: inputs: test_mode: +<<<<<<< HEAD type: choice options: - 'short' - 'long' - 'all' +======= + required: false + type: string + default: 'short' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: tag filter for operator benchmarks, options from long, short, all schedule: # Run at 07:00 UTC every Sunday - cron: 0 7 * * 0 +<<<<<<< HEAD pull_request: paths: - benchmarks/operator_benchmark/** - .github/workflows/operator_benchmark.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read @@ -75,4 +85,43 @@ jobs: build-environment: linux-jammy-aarch64-py3.10 docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} +======= +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: 
linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc73..cce4798eb8ee8 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -41,7 +41,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -59,9 +63,15 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, +======= + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 0c4668aa89c6b..70cd2683e3498 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -20,9 +20,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -43,7 +47,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -51,6 +59,7 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-jammy-cuda12_4-py3_10-gcc11-build: name: linux-jammy-cuda12.4-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -82,6 +91,8 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cuda12_8-py3_10-gcc11-build: name: linux-jammy-cuda12.8-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -113,13 +124,22 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc9-build: name: linux-jammy-cuda12.8-py3.10-gcc9 +======= + linux-jammy-cuda12_8-py3_9-gcc9-build: + name: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda12.8-py3.10-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 cuda-arch-list: 8.6 test-matrix: | @@ -127,6 +147,7 @@ jobs: { config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] }, { config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] }, ]} +<<<<<<< HEAD secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-test: @@ -137,6 +158,19 @@ jobs: build-environment: linux-jammy-cuda12.8-py3.10-gcc9 docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }} +======= + build-with-debug: false + secrets: inherit + + linux-jammy-cuda12_8-py3_9-gcc9-test: + name: linux-jammy-cuda12.8-py3.9-gcc9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_9-gcc9-build + with: + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-debug-build: @@ -147,6 +181,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug docker-image-name: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +<<<<<<< HEAD cuda-arch-list: 8.9 test-matrix: | { include: [ @@ -157,6 +192,18 @@ jobs: { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, +======= + build-with-debug: true + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -172,6 +219,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda13_0-py3_10-gcc11-build: name: linux-jammy-cuda13.0-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -204,6 +252,8 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-rocm-py3_10-build: name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -214,9 +264,15 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ +<<<<<<< HEAD { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, +======= + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", 
owners: ["module:rocm", "oncall:distributed"] }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 7fdfab476705b..3ded657422413 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -19,9 +19,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -42,21 +46,35 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} +<<<<<<< HEAD linux-jammy-py3_10-gcc11-build: name: linux-jammy-py3.10-gcc11 +======= + linux-jammy-py3_9-gcc11-build: + name: linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -73,6 +91,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-test: name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -83,11 +102,24 @@ jobs: build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-gcc11-test: + name: linux-jammy-py3.9-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-gcc11-build + - target-determination + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml +<<<<<<< HEAD needs: linux-jammy-py3_10-gcc11-build with: build-environment: linux-jammy-py3.10-gcc11 @@ -96,26 
+128,51 @@ jobs: linux-jammy-py3_10-gcc11-no-ops: name: linux-jammy-py3.10-gcc11-no-ops +======= + needs: linux-jammy-py3_9-gcc11-build + with: + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + secrets: inherit + + linux-jammy-py3_9-gcc11-no-ops: + name: linux-jammy-py3.9-gcc11-no-ops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-no-ops docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-pch: name: linux-jammy-py3.10-gcc11-pch +======= + linux-jammy-py3_9-gcc11-pch: + name: linux-jammy-py3.9-gcc11-pch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-pch docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -127,12 +184,16 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: +<<<<<<< HEAD runner: linux.2xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -140,10 +201,22 @@ jobs: { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +======= + { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} sync-tag: asan-build secrets: inherit +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -157,13 +230,22 @@ jobs: sync-tag: asan-test secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-onnx-build: name: linux-jammy-py3.10-clang12-onnx +======= + linux-jammy-py3_9-clang12-onnx-build: + name: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12-onnx +======= + build-environment: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx test-matrix: | { include: [ @@ -172,6 +254,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-onnx-test: name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-test.yml @@ -186,12 +269,33 @@ jobs: linux-jammy-py3_10-clang12-build: name: linux-jammy-py3.10-clang12 +======= + linux-jammy-py3_9-clang12-onnx-test: + name: linux-jammy-py3.9-clang12-onnx + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-onnx-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -208,6 +312,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-test: name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml @@ -218,6 +323,18 @@ jobs: build-environment: linux-jammy-py3.10-clang12 docker-image: ${{ 
needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3_13-clang12-build: @@ -252,22 +369,138 @@ jobs: build-environment: linux-jammy-py3.13-clang12 docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} +<<<<<<< HEAD secrets: inherit linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 +======= + timeout-minutes: 600 + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '7.5' + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { 
config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3-clang12-mobile-build: + name: linux-jammy-py3-clang12-mobile-build + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-mobile-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang15-asan + build-generates-artifacts: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + secrets: inherit + +<<<<<<< HEAD +======= + linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-clang9-xla + docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_10-gcc11-bazel-test: name: linux-jammy-cpu-py3.10-gcc11-bazel-test uses: ./.github/workflows/_bazel-build-test.yml @@ -283,14 +516,24 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build +======= + linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: + name: 
linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 +======= + build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -317,6 +560,65 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: 8.9 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + if: false # Has been broken for a while + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: name: cuda12.8-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-build.yml @@ -342,6 +644,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-xpu-n-py3_10-build: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -352,6 +655,17 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 +======= + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index 226d773e48977..476691f003e69 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -26,7 +26,11 @@ jobs: architecture: x64 check-latest: false cache: pip +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c50111d068d24..374751730db1f 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -36,13 +40,20 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-noble-rocm-py3_12-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-noble-rocm-py3.12-mi300 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-noble-rocm-py3.12-mi300 docker-image-name: 
ci-image:pytorch-linux-noble-rocm-n-py3 test-matrix: | @@ -69,4 +80,33 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi300 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 50a791432dc97..706904843cd74 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -26,6 +26,7 @@ jobs: id-token: write contents: read +<<<<<<< HEAD get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main @@ -36,13 +37,19 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-rocm-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml +<<<<<<< HEAD needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +======= + with: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-rocm-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 sync-tag: rocm-build diff --git a/.github/workflows/s390x-periodic.yml b/.github/workflows/s390x-periodic.yml index 405e3e1a581cc..2723fa23dc2ba 100644 --- a/.github/workflows/s390x-periodic.yml +++ b/.github/workflows/s390x-periodic.yml @@ -15,9 +15,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: 
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index d4992a2ddb2cf..b8c4cc8ba2dca 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -18,9 +18,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -41,7 +45,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -78,14 +86,24 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-build: name: linux-jammy-py3.10-clang12 +======= + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang12 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,6 +111,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang12-test: name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml @@ -103,6 +122,18 @@ jobs: build-environment: linux-jammy-py3.10-clang12 docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-rocm-py3_10-build: @@ -140,7 +171,10 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: +<<<<<<< HEAD runner: linux.2xlarge.memory +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index ec579fda8da94..216550ba0f38e 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +39,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 working-directory: pytorch @@ -50,13 +58,21 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@main +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@main +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clone CodeLlama uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -149,7 +165,11 @@ jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@main +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index c712b11185a76..273776839c6c1 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} 
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 5f0ad59d3a3bb..1291f7fa1deee 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,7 +15,11 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" @@ -28,9 +32,15 @@ jobs: check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g4dn.4xlarge.nvidia.gpu +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + runner: linux.4xlarge.nvidia.gpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index ec99f4473bb0b..43335eb2ceb7b 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -4,10 +4,13 @@ on: pull_request: paths: - .github/workflows/test-h100.yml +<<<<<<< HEAD - test/inductor/test_max_autotune.py - torch/_inductor/kernel/mm.py - torch/_inductor/kernel/mm_grouped.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: - cron: 0 4,10,16,22 * * * # every 6 hours @@ -19,16 +22,23 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -41,7 +51,11 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< 
HEAD runner: linux.12xlarge.memory +======= + runner: "linux.12xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' @@ -61,6 +75,7 @@ jobs: docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test: name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test @@ -73,3 +88,5 @@ jobs: timeout-minutes: 30 s3-bucket: gha-artifacts secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 08fcd33402625..c4694fe32d785 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -10,15 +10,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/trunk-tagging.yml b/.github/workflows/trunk-tagging.yml index d96f2de8366aa..cc024b89cbdd9 100644 --- a/.github/workflows/trunk-tagging.yml +++ b/.github/workflows/trunk-tagging.yml @@ -58,10 +58,15 @@ jobs: else COMMIT_SHA="${{ github.sha }}" fi +<<<<<<< HEAD { echo "sha=${COMMIT_SHA}" echo "tag_name=trunk/${COMMIT_SHA}" } >> "${GITHUB_OUTPUT}" +======= + echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Validate commit SHA run: | @@ -89,7 +94,11 @@ jobs: echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" fi +<<<<<<< HEAD - name: Create and push tag(s) with retry +======= + - name: Create and push tag with retry +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: check_tag env: TAG_NAME: ${{ steps.commit.outputs.tag_name }} @@ -114,6 +123,7 @@ jobs: return 1 } +<<<<<<< HEAD # Counters for summary reporting created_count=0 skipped_count=0 @@ -131,6 +141,16 @@ jobs: fi } trap finish EXIT +======= + # Exit early if tag already exists + if check_tag_exists; then + echo "✅ Tag already exists - no action needed" + echo "exists=true" >> "${GITHUB_OUTPUT}" + exit 0 
+ fi + + echo "Tag ${TAG_NAME} does not exist, proceeding with creation" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Retry configuration MAX_RETRIES=5 @@ -205,6 +225,7 @@ jobs: } } +<<<<<<< HEAD # New behavior for push events: enumerate commits in the push and tag each one. # For workflow_dispatch, retain existing single-SHA behavior. @@ -278,11 +299,21 @@ jobs: failed_count=1 exit 1 fi +======= + # Execute with retry + if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then + echo "exists=false" >> "${GITHUB_OUTPUT}" + exit 0 + else + echo "Tag creation failed after all retry attempts" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Tag creation summary if: always() run: | +<<<<<<< HEAD if [ "${{ github.event_name }}" = "push" ]; then echo "Trigger: push on main" echo "Created: ${{ steps.check_tag.outputs.created_count }}" @@ -312,4 +343,21 @@ jobs: if [ -n "${{ github.event.inputs.commit_sha }}" ]; then echo " Manual commit: ${{ github.event.inputs.commit_sha }}" fi +======= + if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then + echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" + elif [ "${{ job.status }}" = "success" ]; then + echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + else + echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + fi + + echo "" + echo "Tag details:" + echo " Name: ${{ steps.commit.outputs.tag_name }}" + echo " Commit: ${{ steps.commit.outputs.sha }}" + echo " Trigger: ${{ github.event_name }}" + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo " Manual commit: ${{ github.event.inputs.commit_sha }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 48d1c4490d726..8d3b9d1fbe956 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -16,9 +16,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: llm-td: @@ -39,7 +43,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -56,13 +64,18 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runner: 
"linux.c7i.4xlarge" +======= + runner: "linux.4xlarge" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda12_8-py3_10-gcc11-build: name: linux-jammy-cuda12.8-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -100,6 +113,8 @@ jobs: secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops @@ -131,6 +146,10 @@ jobs: { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, +<<<<<<< HEAD +======= + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, ]} @@ -160,10 +179,16 @@ jobs: runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, +======= + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -180,6 +205,7 @@ jobs: disable-monitor: false secrets: inherit +<<<<<<< HEAD win-vs2022-cuda12_8-py3-build: name: win-vs2022-cuda12.8-py3 uses: ./.github/workflows/_win-build.yml @@ -187,6 +213,15 @@ jobs: with: build-environment: win-vs2022-cuda12.8-py3 cuda-version: "12.8" +======= + win-vs2022-cuda12_6-py3-build: + name: win-vs2022-cuda12.6-py3 + uses: ./.github/workflows/_win-build.yml + needs: get-label-type + with: + build-environment: win-vs2022-cuda12.6-py3 + cuda-version: "12.6" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit @@ -202,8 +237,14 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ +<<<<<<< HEAD { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "default", 
shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, +======= + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -221,6 +262,7 @@ jobs: build-environment: linux-jammy-rocm-py3.10 docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +<<<<<<< HEAD tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" secrets: inherit @@ -230,10 +272,23 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 +======= + tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" + secrets: inherit + + # NB: Keep this in sync with inductor-perf-test-nightly.yml + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit +<<<<<<< HEAD # Test cross-compiled models with Windows libs extracted from wheel cross-compile-linux-test: name: cross-compile-linux-test @@ -251,14 +306,21 @@ jobs: ]} secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) verify-cachebench-cpu-build: name: verify-cachebench-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -272,6 +334,7 @@ jobs: - verify-cachebench-cpu-build - target-determination with: +<<<<<<< HEAD build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} @@ -311,3 +374,9 @@ jobs: build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit +======= + build-environment: linux-jammy-py3.9-gcc11 + docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 5c456c607c887..e18659f799f1f 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -28,7 +28,11 @@ jobs: check-latest: false cache: pip architecture: x64 +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | @@ -59,6 +63,7 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi +<<<<<<< HEAD # Require a comment id for merge operations if [ -z "${COMMENT_ID}" ]; then @@ -72,6 +77,24 @@ jobs: python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" +======= + if [ -n "${FORCE}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --force "${PR_NUM}" + fi + elif [ -n "${IGNORE_CURRENT}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" + fi + elif [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py "${PR_NUM}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index 1a8e00e4390be..43275303c3acc 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -25,7 +25,11 @@ jobs: architecture: x64 check-latest: false cache: pip +<<<<<<< HEAD - run: pip install pyyaml==6.0.2 +======= + - run: pip install pyyaml==6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup committer id run: | diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index b5955127d9fb3..038ac58bd4efb 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,9 +12,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +<<<<<<< HEAD permissions: id-token: write contents: read +======= +permissions: read-all +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: # There must be at least one job here to satisfy GitHub action workflow syntax @@ -46,7 +50,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && 
github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index b3fc9efdf667f..62eadf26e2fd7 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -7,7 +7,11 @@ on: concurrency: group: ${{ github.workflow }} +<<<<<<< HEAD cancel-in-progress: true +======= + cancel-in-progress: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: do_update_viablestrict: @@ -18,12 +22,20 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-viablestrict@main +======= + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: update_viablestrict with: repository: pytorch/pytorch stable-branch: viable/strict +<<<<<<< HEAD requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]' +======= + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} @@ -48,7 +60,10 @@ jobs: echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json" pip install awscli==1.29.40 aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json" +<<<<<<< HEAD # Push new viable/strict tag cd pytorch/pytorch git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index a1b8c38141ae8..8a3eccc523a22 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,7 +17,11 @@ jobs: contents: read steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 9aecaad0e068f..b0757b8622850 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,11 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml 
index f77b6081b776a..c5d6a4fef1158 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -14,7 +14,10 @@ on: - inductor-periodic - rocm - rocm-mi300 +<<<<<<< HEAD - rocm-mi355 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - inductor-micro-benchmark - inductor-micro-benchmark-x86 - inductor-cu124 @@ -58,7 +61,11 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 07471619437a2..f8a9fb57ec0e8 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -32,7 +32,11 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index 5702562006055..159d1e0873eba 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,7 +17,11 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@main +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index b95dadd5f2b1c..b820ca44ef3be 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -22,7 +22,11 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@main +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-name: xla branch: master diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36f603f70fde7..79d4b2d3fe3db 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -5,10 +5,13 @@ on: tags: - ciflow/xpu/* workflow_dispatch: +<<<<<<< HEAD schedule: # Run 3 times on weekdays and less frequently on weekends. 
- cron: 45 0,8,16 * * 1-5 - cron: 45 4 * * 0,6 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -19,13 +22,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-jammy-xpu-n-1-py3_10-build: name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -36,6 +44,18 @@ jobs: build-environment: linux-jammy-xpu-n-1-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.c7i.12xlarge +======= + linux-jammy-xpu-2025_0-py3_9-build: + name: linux-jammy-xpu-2025.0-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-0-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.0-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + runner: linux.12xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, @@ -47,6 +67,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-xpu-n-py3_10-build: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml @@ -78,10 +99,38 @@ jobs: name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-n-py3_10-build +======= + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-test: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_xpu-test.yml + needs: linux-jammy-xpu-2025_1-py3_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) permissions: id-token: write contents: read with: +<<<<<<< HEAD build-environment: 
linux-jammy-xpu-n-py3.10 docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} @@ -93,11 +142,37 @@ jobs: uses: ./.github/workflows/_win-build.yml with: build-environment: win-vs2022-xpu-n-1-py3 +======= + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + secrets: inherit + + windows-xpu-2025_0-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_0-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 + cuda-version: cpu + use-xpu: true + xpu-version: '2025.0' + vc-year: '2022' + secrets: inherit + + windows-xpu-2025_1-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_1-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-version: cpu use-xpu: true xpu-version: '2025.1' vc-year: '2022' secrets: inherit +<<<<<<< HEAD windows-xpu-n-build: if: github.repository_owner == 'pytorch' @@ -110,3 +185,5 @@ jobs: xpu-version: '2025.2' vc-year: '2022' secrets: inherit +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitignore b/.gitignore index 447ef777e9291..7fd4227d952a1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,10 @@ coverage.xml aten/build/ aten/src/ATen/Config.h aten/src/ATen/cuda/CUDAConfig.h +<<<<<<< HEAD aten/src/ATen/hip/HIPConfig.h +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) benchmarks/.data caffe2/cpp_test/ dist/ @@ -82,13 +85,20 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +<<<<<<< HEAD torch/csrc/functionalization/generated/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt +<<<<<<< HEAD torch/headeronly/version.h +======= +torch/csrc/api/include/torch/version.h +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp @@ -148,9 +158,12 @@ merge_record.json torchgen/packaged/* !torchgen/packaged/README.md +<<<<<<< HEAD # This file is injected by ROCm build scripts to bootstrap in torch/__init__.py. 
torch/_rocm_init.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # IPython notebook checkpoints .ipynb_checkpoints @@ -260,9 +273,12 @@ gen .pytest_cache aten/build/* +<<<<<<< HEAD # Linker scripts for prioritized text optimization cmake/linker_script.ld +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Bram plsdontbreak @@ -374,7 +390,10 @@ third_party/ruy/ third_party/glog/ # Virtualenv +<<<<<<< HEAD .venv/ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) venv/ # Log files @@ -394,6 +413,9 @@ android/pytorch_android_torchvision/.cxx # Claude Code local configuration CLAUDE.local.md +<<<<<<< HEAD /test_*.py /debug_*.py CLAUDE_CONTEXT/ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitmodules b/.gitmodules index ba1bca8c7e6c6..3582acaa0af54 100644 --- a/.gitmodules +++ b/.gitmodules @@ -87,7 +87,11 @@ url = https://github.com/NVIDIA/cudnn-frontend.git [submodule "third_party/kineto"] path = third_party/kineto +<<<<<<< HEAD url = https://github.com/pytorch/kineto.git +======= + url = https://github.com/pytorch/kineto +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft @@ -129,6 +133,9 @@ [submodule "third_party/flash-attention"] path = third_party/flash-attention url = https://github.com/Dao-AILab/flash-attention.git +<<<<<<< HEAD [submodule "third_party/aiter"] path = third_party/aiter url = https://github.com/ROCm/aiter.git +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.lintrunner.toml b/.lintrunner.toml index 26ade791a1bde..0a48295057c14 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -13,12 +13,19 @@ exclude_patterns = [ '**/fb/**', 'functorch/docs/**', 'functorch/examples/**', +<<<<<<< HEAD 'functorch/docs/source/tutorials/**', +======= + 'functorch/notebooks/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', +<<<<<<< HEAD 'test/test_torchfuzz_repros.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CPython tests 'test/dynamo/cpython/**', # Tests from the NumPy test suite @@ -28,7 +35,10 @@ exclude_patterns = [ 'torch/lib/**', 'venv/**', '**/*.pyi', +<<<<<<< HEAD "tools/experimental/torchfuzz/**", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'tools/test/test_selective_build.py', ] command = [ @@ -41,6 +51,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'flake8==7.3.0', 'flake8-bugbear==24.12.12', 'flake8-comprehensions==3.16.0', @@ -52,6 +63,19 @@ init_command = [ 'pycodestyle==2.14.0', 'pyflakes==3.4.0', 'torchfix==0.4.0 ; python_version >= "3.10" and 
python_version < "3.13"', +======= + 'flake8==6.1.0', + 'flake8-bugbear==23.3.23', + 'flake8-comprehensions==3.15.0', + 'flake8-executable==2.1.3', + 'flake8-logging-format==0.9.0', + 'flake8-pyi==23.3.1', + 'flake8-simplify==0.19.3', + 'mccabe==0.7.0', + 'pycodestyle==2.11.1', + 'pyflakes==3.1.0', + 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] @@ -124,8 +148,11 @@ is_formatter = true [[linter]] code = 'MYPY' include_patterns = [ +<<<<<<< HEAD 'setup.py', 'functorch/dim/**/*.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -135,7 +162,11 @@ include_patterns = [ 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', +<<<<<<< HEAD 'test/test_numpy_interop.py', +======= + # 'test/test_numpy_interop.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_torch.py', 'test/test_type_hints.py', 'test/test_type_info.py', @@ -155,22 +186,37 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', +======= + 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', 'sympy==1.13.3', 'types-requests==2.27.25', +<<<<<<< HEAD 'types-pyyaml==6.0.2', +======= + 'types-pyyaml==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'types-tabulate==0.8.8', 'types-protobuf==5.29.1.20250403', 'types-setuptools==79.0.0.20250422', 'types-jinja2==2.11.9', 'types-colorama==0.4.6', +<<<<<<< HEAD 'filelock==3.18.0', 'junitparser==2.1.1', 'rich==14.1.0', 'pyyaml==6.0.2', +======= + 'filelock==3.13.1', + 'junitparser==2.1.1', + 'rich==10.9.0', + 'pyyaml==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'optree==0.13.0', 'dataclasses-json==0.6.7', 'pandas==2.2.3', @@ -198,7 +244,10 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', +<<<<<<< HEAD 'tools/experimental/torchfuzz/**', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -209,6 +258,7 @@ command = [ '@{{PATHSFILE}}' ] +<<<<<<< HEAD [[linter]] code = 'PYREFLY' @@ -249,6 +299,8 @@ init_command = [ 'types-python-dateutil==2.9.0.20251008' ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[linter]] code = 'CLANGTIDY' include_patterns = [ @@ -275,8 +327,12 @@ include_patterns = [ 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', +<<<<<<< HEAD 'torch/_inductor/codegen/aoti_runtime/*.h', 'torch/_inductor/codegen/aoti_runtime/*.cpp', +======= + 'torch/_inductor/codegen/aoti_runtime/interface.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) 'torch/csrc/*.h', 'torch/csrc/*.cpp', 'torch/csrc/**/*.h', @@ -544,7 +600,11 @@ include_patterns = [ '**/*.h', ] exclude_patterns = [ +<<<<<<< HEAD 'torch/headeronly/macros/Macros.h', +======= + 'c10/macros/Macros.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -567,7 +627,11 @@ include_patterns = [ '**/*.h', ] exclude_patterns = [ +<<<<<<< HEAD 'torch/headeronly/macros/Macros.h', +======= + 'c10/macros/Macros.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -627,7 +691,11 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', +<<<<<<< HEAD '--pattern=#include >>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) '--allowlist-pattern=#include ', '--linter-name=PYBIND11_INCLUDE', '--match-first-only', @@ -833,7 +901,12 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', +<<<<<<< HEAD '--pattern=(cudaSetDevice|cudaGetDevice)\\(', +======= + '--pattern=cudaSetDevice(', + '--pattern=cudaGetDevice(', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) '--linter-name=RAWCUDADEVICE', '--error-name=raw CUDA API usage', """--error-description=\ @@ -1007,6 +1080,10 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py +<<<<<<< HEAD +======= + 'test/bottleneck_test/**', # excluded by test/run_test.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1137,8 +1214,16 @@ command = [ [[linter]] code = 'WORKFLOWSYNC' include_patterns = [ +<<<<<<< HEAD '.github/workflows/*.yml', '.github/workflows/*.yaml', +======= + '.github/workflows/pull.yml', + '.github/workflows/trunk.yml', + '.github/workflows/periodic.yml', + '.github/workflows/mac-mps.yml', + '.github/workflows/slow.yml', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -1150,7 +1235,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'pyyaml==6.0.2', +======= + 'PyYAML==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] [[linter]] @@ -1172,7 +1261,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'pyyaml==6.0.2', +======= + 'PyYAML==6.0.1', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] [[linter]] @@ -1197,6 +1290,7 @@ exclude_patterns = [ 'torch/_vendor/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', +<<<<<<< HEAD 'torch/utils/model_dump/preact.mjs', # These files are all grandfathered in, feel free to remove from this list # as necessary @@ -1204,6 +1298,31 @@ exclude_patterns = [ 
'aten/src/ATen/native/[a-pA-P]*/**', 'aten/src/ATen/[a-mA-M]*/**', 'test/**', +======= + # These files are all grandfathered in, feel free to remove from this list + # as necessary + # NOTE: remove the patterns in the order they are listed + 'aten/**', + 'aten/src/ATen/native/**', + 'aten/src/ATen/native/q*/**', + 'aten/src/ATen/native/[a-pA-P]*/**', + 'aten/src/ATen/[a-mA-M]*/**', + 'test/**', + 'test/test_*', + 'test/[a-hA-h]*/**', + 'test/inductor/**', + 'test/dynamo/**', + 'test/distributed/**', + 'torch/**', + 'torch/_*/**', + 'torch/ao/**', + 'torch/fx/**', + 'torch/distributed/tensor/**', + 'torch/[j-o]*/**', + 'torch/utils/**', + 'torch/csrc/jit/**', + 'torch/csrc/jit/[a-o]*/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -1298,7 +1417,10 @@ exclude_patterns = [ 'test/test_masked.py', 'test/test_maskedtensor.py', 'test/test_matmul_cuda.py', +<<<<<<< HEAD 'test/test_scaled_matmul_cuda.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_meta.py', 'test/test_metal.py', 'test/test_mkl_verbose.py', @@ -1450,6 +1572,11 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', +<<<<<<< HEAD +======= + 'torch/utils/bottleneck/__init__.py', + 'torch/utils/bottleneck/__main__.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1490,13 +1617,22 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'usort==1.0.8.post1', 'isort==6.0.1', 'ruff==0.13.1', # sync with RUFF +======= + '--no-black-binary', + 'black==23.12.1', + 'usort==1.0.8.post1', + 'isort==6.0.1', + 'ruff==0.11.13', # sync with RUFF +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true [[linter]] +<<<<<<< HEAD code = 'PYPROJECT' command = [ 'python3', @@ -1541,6 +1677,8 @@ init_command = [ ] [[linter]] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) code = 'COPYRIGHT' include_patterns = ['**'] exclude_patterns = [ @@ -1606,10 +1744,17 @@ include_patterns = [ exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', +<<<<<<< HEAD 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', 'test/test_torchfuzz_repros.py', +======= + 'functorch/notebooks/**', + 'torch/_inductor/fx_passes/serialized_patterns/**', + 'torch/_inductor/autoheuristic/artifacts/**', + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'scripts/**', 'third_party/**', 'fb/**', @@ -1627,7 +1772,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruff==0.13.1', # sync with PYFMT +======= + 'ruff==0.11.13', # sync with PYFMT +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -1636,10 +1785,14 @@ 
is_formatter = true # the same line, merge conflicts should not arise in git or hg [[linter]] code = 'MERGE_CONFLICTLESS_CSV' +<<<<<<< HEAD include_patterns = [ 'benchmarks/dynamo/ci_expected_accuracy/*.csv', 'benchmarks/dynamo/pr_time_benchmarks/expected_results.csv', ] +======= +include_patterns = ['benchmarks/dynamo/ci_expected_accuracy/*.csv'] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) command = [ 'python3', 'tools/linter/adapters/no_merge_conflict_csv_linter.py', @@ -1830,6 +1983,7 @@ include_patterns = [ 'torch/header_only_apis.txt', ] is_formatter = false +<<<<<<< HEAD [[linter]] @@ -1839,3 +1993,5 @@ command = [ "python3", "tools/linter/adapters/gb_registry_linter.py", ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/AGENTS.md b/AGENTS.md index 3d5436a02a85d..dd27ff6213af6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,5 @@ - This is the only AGENTS.md, there are no recursive AGENTS.md +<<<<<<< HEAD - When you are working on a bug, first create a standalone file that reproduces the bug and verify it fails in the expected way. Use this to test if your changes work. Once the change is passing, find an appropriate @@ -15,3 +16,5 @@ - git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch - git stash pop - Resolve conflicts if necessary +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/BUILD.bazel b/BUILD.bazel index 4737a2a0c486c..2ac9a0823ad87 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -13,9 +13,12 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources") load("//:tools/bazel.bzl", "rules") +<<<<<<< HEAD # Export files for use by torch/headeronly (where version.h generation now lives) exports_files(["version.txt"]) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) define_targets(rules = rules) COMMON_COPTS = [ @@ -94,8 +97,11 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/VmapGeneratedPlumbing.h", +<<<<<<< HEAD "aten/src/ATen/ViewMetaClasses.h", "aten/src/ATen/ViewMetaClasses.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -284,7 +290,10 @@ header_template_rule( "@AT_BLAS_F2C@": "0", "@AT_BLAS_USE_CBLAS_DOT@": "1", "@AT_KLEIDIAI_ENABLED@": "0", +<<<<<<< HEAD "@AT_USE_EIGEN_SPARSE@": "0", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, ) @@ -685,7 +694,10 @@ cc_library( [ "torch/*.h", "torch/csrc/**/*.h", +<<<<<<< HEAD "torch/nativert/**/*.h", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "torch/csrc/distributed/c10d/**/*.hpp", "torch/lib/libshm/*.h", ], @@ -693,9 +705,13 @@ cc_library( "torch/csrc/*/generated/*.h", 
"torch/csrc/jit/serialization/mobile_bytecode_generated.h", ] + torch_cuda_headers, +<<<<<<< HEAD ) + GENERATED_AUTOGRAD_CPP + [ "//torch/headeronly:version_h", ], +======= + ) + GENERATED_AUTOGRAD_CPP + [":version_h"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) includes = [ "third_party/kineto/libkineto/include", "torch/csrc", @@ -754,7 +770,10 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", +<<<<<<< HEAD "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, @@ -840,6 +859,39 @@ pybind_extension( ], ) +<<<<<<< HEAD +======= +cc_library( + name = "functorch", + hdrs = glob([ + "functorch/csrc/dim/*.h", + ]), + srcs = glob([ + "functorch/csrc/dim/*.cpp", + ]), + deps = [ + ":aten_nvrtc", + ":torch_python", + "@pybind11", + ], +) + +pybind_extension( + name = "functorch/_C", + copts=[ + "-DTORCH_EXTENSION_NAME=_C" + ], + srcs = [ + "functorch/csrc/init_dim_only.cpp", + ], + deps = [ + ":functorch", + ":torch_python", + ":aten_nvrtc", + ], +) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -880,6 +932,10 @@ py_library( ], data = [ ":torch/_C.so", +<<<<<<< HEAD +======= + ":functorch/_C.so", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ":torch/bin/torch_shm_manager", ], ) @@ -1082,7 +1138,10 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", +<<<<<<< HEAD "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CMakeLists.txt b/CMakeLists.txt index f081d8166d7f7..7f23de84e9d36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +<<<<<<< HEAD +======= +# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -25,7 +29,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) endif() # ---[ Project and semantic versioning. 
+<<<<<<< HEAD project(Torch CXX C HIP) +======= +project(Torch CXX C) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(LINUX TRUE) @@ -56,11 +64,19 @@ set(CMAKE_C_STANDARD # ---[ Utils include(cmake/public/utils.cmake) +<<<<<<< HEAD # --- [ Check that minimal gcc version is 9.2+ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2) message( FATAL_ERROR "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +======= +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message( + FATAL_ERROR + "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) endif() @@ -232,16 +248,23 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on" option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) +<<<<<<< HEAD option(USE_LSAN "Use Leak Sanitizer" OFF) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) +<<<<<<< HEAD cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF) cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF) option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF) +======= +cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF @@ -253,6 +276,10 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) +<<<<<<< HEAD +======= +option(USE_FAKELOWP "Use FakeLowp operators" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." 
OFF) @@ -261,6 +288,7 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) +<<<<<<< HEAD option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) @@ -273,6 +301,16 @@ cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +======= +cmake_dependent_option(USE_NCCL "Use NCCL" ON + "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) +cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) +cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" + OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -289,7 +327,10 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." option(USE_PROF "Use profiling" OFF) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) +<<<<<<< HEAD option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) cmake_dependent_option( @@ -326,6 +367,10 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) +<<<<<<< HEAD +======= +option(USE_DISTRIBUTED "Use distributed" ON) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) @@ -378,6 +423,7 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON "CPU_AARCH64" OFF) +<<<<<<< HEAD # prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. set(USE_PRIORITIZED_TEXT_DEFAULT OFF) if(LINUX AND CPU_AARCH64) @@ -391,6 +437,14 @@ option(USE_MIMALLOC "Use mimalloc" OFF) # on Windows and AArch64. option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) if(WIN32 OR (CPU_AARCH64 AND NOT APPLE)) +======= + +option(USE_MIMALLOC "Use mimalloc" OFF) +# Enable third party mimalloc library to improve memory allocation performance +# on Windows. 
+option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) +if(WIN32) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(USE_MIMALLOC ON) # Not enable USE_MIMALLOC_ON_MKL due to it caused issue: @@ -442,7 +496,11 @@ if(WIN32) message( WARNING "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " +<<<<<<< HEAD "Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." +======= + "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else() set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) @@ -574,7 +632,11 @@ if(MSVC) set(CMAKE_NINJA_CMCLDEPS_RC OFF) if(MSVC_Z7_OVERRIDE) # CMake set debug flags to use /Z7 +<<<<<<< HEAD set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:Embedded>") +======= + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() foreach( flag_var @@ -663,11 +725,14 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +<<<<<<< HEAD # Set linker max-page-size to 64KiB on AArch64 Linux if(LINUX AND CPU_AARCH64) add_link_options_if_supported("-z,max-page-size=0x10000") endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -849,11 +914,18 @@ include(ExternalProject) # ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and # CMAKE_SYSTEM_PROCESSOR thinks its 64bit +<<<<<<< HEAD if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") message(WARNING "x64 operating system is required for FBGEMM. " "Not compiling with FBGEMM. 
" "Turn this warning off by USE_FBGEMM=OFF.") +======= +if(USE_FBGEMM + AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL + 4) + OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86")) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(USE_FBGEMM OFF) endif() @@ -888,6 +960,7 @@ cmake_dependent_option( "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) +<<<<<<< HEAD IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH) message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build") @@ -912,6 +985,8 @@ cmake_dependent_option( if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( @@ -945,10 +1020,13 @@ if(USE_FBGEMM) string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM") endif() +<<<<<<< HEAD if(USE_FBGEMM_GENAI) string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI") endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_PYTORCH_QNNPACK) string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK") endif() @@ -1225,7 +1303,11 @@ if(APPLE) string( APPEND CMAKE_SHARED_LINKER_FLAGS +<<<<<<< HEAD " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal -weak_framework IOKit" +======= + " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # To suppress MPSGraph availability warnings append_cxx_flag_if_supported("-Wno-unguarded-availability-new" @@ -1234,6 +1316,13 @@ if(APPLE) append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS) endif() +<<<<<<< HEAD +======= +if(USE_XPU) + string(APPEND CMAKE_CXX_FLAGS " -DUSE_XPU") +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(EMSCRIPTEN) string( APPEND @@ -1285,7 +1374,10 @@ if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL) endif() # ---[ Main build +<<<<<<< HEAD add_subdirectory(torch/headeronly) # headeronly headers +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) add_subdirectory(c10) add_subdirectory(caffe2) @@ -1395,6 +1487,13 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() +<<<<<<< HEAD +======= +if(BUILD_FUNCTORCH) + add_subdirectory(functorch) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1433,6 +1532,7 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() +<<<<<<< HEAD if(USE_PRIORITIZED_TEXT_FOR_LD) add_compile_options( @@ -1487,3 +1587,5 @@ else() ]]) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CODEOWNERS b/CODEOWNERS index cc249dc4f43a2..ca5526e3eb5e6 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -14,6 +14,10 @@ /torch/csrc/autograd/ @albanD @soulitzer /torch/autograd/ @albanD @soulitzer /tools/autograd/ @albanD @soulitzer +<<<<<<< HEAD +======= +/torch/header_only_apis.txt @janeyx99 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /torch/nn/ @albanD @jbschlosser @mikaylagawarecki /torch/optim/ @albanD @janeyx99 /test/test_public_bindings.py @albanD @@ -50,12 +54,21 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/c10d/Ops.* @kwen2501 # ONNX Export +<<<<<<< HEAD /torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby /torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre /torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre /torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre /torch/onnx/ @titaiwangms @xadupre @justinchuby /test/onnx/ @titaiwangms @xadupre @justinchuby +======= +/torch/_dynamo/backends/onnxrt.py @wschin +/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 +/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CI /.ci @pytorch/pytorch-dev-infra @@ -135,7 +148,11 @@ torch/profiler/ @sraikund16 test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader +<<<<<<< HEAD torch/utils/data/ @divyanshk @ramanishsingh @scotts +======= +torch/utils/data/ @divyanshk @ramanishsingh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -164,7 +181,10 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +<<<<<<< HEAD /torch/_export/serde/schema.py @SherlockNoMad @zhxchen17 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Dynamic Shapes /torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka @@ -181,6 +201,7 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/csrc/jit/python/init.cpp @mikaylagawarecki # CUDA and CUDA math libraries +<<<<<<< HEAD aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A @@ -190,12 +211,24 @@ torch/cuda/ @eqy @syed-ahmed @Aidyn-A torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A +======= +aten/src/ATen/cuda/ @eqy @syed-ahmed +aten/src/ATen/cudnn/ @eqy @syed-ahmed +aten/src/ATen/native/cuda/ @eqy @syed-ahmed +aten/src/ATen/native/cudnn/ @eqy @syed-ahmed +c10/cuda @eqy @syed-ahmed +torch/cuda/ @eqy @syed-ahmed +torch/csrc/cuda/ @eqy @syed-ahmed +torch/backends/cuda/ @eqy @syed-ahmed +torch/backends/cudnn/ @eqy @syed-ahmed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # PyTree utilities /torch/utils/_pytree.py @XuehaiPan 
/torch/utils/_cxx_pytree.py @XuehaiPan /torch/utils/pytree/ @XuehaiPan /torch/_dynamo/polyfills/pytree.py @XuehaiPan +<<<<<<< HEAD # Relating to libtorch ABI /torch/csrc/stable/ @janeyx99 @mikaylagawarecki @@ -215,3 +248,5 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 /test/test_scaled_matmul_cuda.py @drisspg @slayton58 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4c46077f9db71..8c35edba26b6a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,13 +81,18 @@ git remote add upstream git@github.com:pytorch/pytorch.git make setup-env # Or run `make setup-env-cuda` for pre-built CUDA binaries # Or run `make setup-env-rocm` for pre-built ROCm binaries +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ### Tips and Debugging * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. +<<<<<<< HEAD * When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). @@ -101,6 +106,22 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder), would be sufficient to make change visible in `torch` package. +======= +* When installing with `python setup.py develop` (in contrast to `python setup.py install`) Python runtime will use + the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) + This way you do not need to repeatedly install after modifying Python files (`.py`). + However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or + non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). + + + One way to avoid running `python setup.py develop` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: + ```bash + pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd + ``` + Afterwards rebuilding a library (for example to rebuild `libtorch_cpu.so` issue `ninja torch_cpu` from `build` folder), + would be sufficient to make change visible in `torch` package. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To reinstall, first uninstall all existing PyTorch installs. You may need to run `pip @@ -114,9 +135,15 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows pip uninstall torch ``` +<<<<<<< HEAD Next run `python setup.py clean`. 
After that, you can install in editable mode again. * If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: +======= + Next run `python setup.py clean`. After that, you can install in `develop` mode again. + +* If you run into errors when running `python setup.py develop`, here are some debugging steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,6 +156,7 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive +<<<<<<< HEAD python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` @@ -143,6 +171,15 @@ source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows python -m pip install --no-build-isolation -v -e . ``` +======= + python setup.py develop + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * If you run into issue running `git submodule update --init --recursive`. Please try the following: - If you encounter an error such as @@ -182,26 +219,39 @@ You can use this script to check out a new nightly branch with the following: ```bash ./tools/nightly.py checkout -b my-nightly-branch +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --cuda +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --rocm +<<<<<<< HEAD source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +======= +source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` You can also use this tool to pull the nightly commits into the current branch: ```bash +<<<<<<< HEAD ./tools/nightly.py pull source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` @@ -212,6 +262,10 @@ pass in the `--python` argument: ```bash ./tools/nightly.py --python /path/to/python3.12 source venv/bin/activate # or `. 
.\venv\Scripts\activate` on Windows +======= +./tools/nightly.py pull -p my-env +source my-env/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Pulling will recreate a fresh virtual environment and reinstall the development @@ -267,7 +321,10 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +<<<<<<< HEAD * [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -303,7 +360,11 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` +<<<<<<< HEAD pip install --group dev +======= +pip install -r requirements.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` will install these dependencies for you. @@ -654,9 +715,15 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending +<<<<<<< HEAD your `python -m pip install -e . -v --no-build-isolation` call to compile PyTorch with `DEBUG=1`. Depending on your operating system it may also be necessary to run `py-spy` with root privileges. +======= +your `setup.py develop` call to compile PyTorch with `DEBUG=1`. Depending on +your operating system it may also be necessary to run `py-spy` with root +privileges. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -664,10 +731,17 @@ details. ## Managing multiple build trees +<<<<<<< HEAD One downside to using `python -m pip install -e . -v --no-build-isolation` is that your development version of PyTorch will be installed globally on your account (e.g., if you run `import torch` anywhere else, the development version will be used). +======= +One downside to using `python setup.py develop` is that your development +version of PyTorch will be installed globally on your account (e.g., if +you run `import torch` anywhere else, the development version will be +used). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -678,7 +752,11 @@ specific build of PyTorch. 
To set one up: python -m venv pytorch-myfeature source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows # if you run python now, torch will NOT be installed +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ## C++ development tips @@ -716,9 +794,13 @@ variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTEN For example: ```bash +<<<<<<< HEAD DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 \ USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 \ python -m pip install --no-build-isolation -v -e . +======= +DEBUG=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 USE_NNPACK=0 USE_QNNPACK=0 USE_XNNPACK=0 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` For subsequent builds (i.e., when `build/CMakeCache.txt` exists), the build @@ -728,7 +810,11 @@ options. ### Code completion and IDE support +<<<<<<< HEAD When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate +======= +When using `python setup.py develop`, PyTorch will generate +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate @@ -789,7 +875,11 @@ If not, you can define these variables on the command line before invoking `setu export CMAKE_C_COMPILER_LAUNCHER=ccache export CMAKE_CXX_COMPILER_LAUNCHER=ccache export CMAKE_CUDA_COMPILER_LAUNCHER=ccache +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` #### Use a faster linker @@ -802,7 +892,11 @@ If you are editing a single file and rebuilding in a tight loop, the time spent Starting with CMake 3.29, you can specify the linker type using the [`CMAKE_LINKER_TYPE`](https://cmake.org/cmake/help/latest/variable/CMAKE_LINKER_TYPE.html) variable. For example, with `mold` installed: ```sh +<<<<<<< HEAD CMAKE_LINKER_TYPE=MOLD python -m pip install --no-build-isolation -v -e . +======= +CMAKE_LINKER_TYPE=MOLD python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` #### Use pre-compiled headers @@ -814,7 +908,11 @@ setting `USE_PRECOMPILED_HEADERS=1` either on first setup, or in the `CMakeCache.txt` file. ```sh +<<<<<<< HEAD USE_PRECOMPILED_HEADERS=1 python -m pip install --no-build-isolation -v -e . +======= +USE_PRECOMPILED_HEADERS=1 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` This adds a build step where the compiler takes `` and essentially @@ -837,7 +935,11 @@ A compiler-wrapper to fix this is provided in `tools/nvcc_fix_deps.py`. You can this as a compiler launcher, similar to `ccache` ```bash export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache" +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . 
+======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ### Rebuild few files with debug information @@ -1188,7 +1290,11 @@ build_with_asan() CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \ +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } run_with_asan() diff --git a/Dockerfile b/Dockerfile index 331cf00593cb2..dceb1b1bb9663 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,7 +33,11 @@ RUN case ${TARGETPLATFORM} in \ *) MINICONDA_ARCH=x86_64 ;; \ esac && \ curl -fsSL -v -o ~/miniconda.sh -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${MINICONDA_ARCH}.sh" +<<<<<<< HEAD COPY requirements.txt requirements-build.txt . +======= +COPY requirements.txt . +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 RUN chmod +x ~/miniconda.sh && \ bash ~/miniconda.sh -b -p /opt/conda && \ @@ -47,6 +51,7 @@ WORKDIR /opt/pytorch COPY . . RUN git submodule update --init --recursive +<<<<<<< HEAD FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 @@ -54,6 +59,28 @@ ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 +======= +FROM conda as build +ARG CMAKE_VARS +WORKDIR /opt/pytorch +COPY --from=conda /opt/conda /opt/conda +COPY --from=submodule-update /opt/pytorch /opt/pytorch +RUN make triton +RUN --mount=type=cache,target=/opt/ccache \ + export eval ${CMAKE_VARS} && \ + TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + python setup.py install + +FROM conda as conda-installs +ARG PYTHON_VERSION=3.11 +ARG CUDA_PATH=cu121 +ARG CUDA_CHANNEL=nvidia +ARG INSTALL_CHANNEL=whl/nightly +# Automatically set by buildx +RUN /opt/conda/bin/conda update -y -n base -c defaults conda +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG TARGETPLATFORM @@ -96,5 +123,9 @@ WORKDIR /workspace FROM official as dev # Should override the already installed version from the official-image stage +<<<<<<< HEAD COPY --from=conda /opt/conda /opt/conda COPY --from=submodule-update /opt/pytorch /opt/pytorch +======= +COPY --from=build /opt/conda /opt/conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/MANIFEST.in b/MANIFEST.in index bb8e488283a96..ddac0b58efb0d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ +<<<<<<< HEAD # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html # Include individual top-level files @@ -97,3 +98,36 @@ include .gitmodules # concern here. 
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features +======= +include MANIFEST.in +include CMakeLists.txt +include CITATION.cff +include LICENSE +include NOTICE +include .gitmodules +include build_variables.bzl +include mypy.ini +include requirements.txt +include ufunc_defs.bzl +include version.txt +recursive-include android *.* +recursive-include aten *.* +recursive-include binaries *.* +recursive-include c10 *.* +recursive-include caffe2 *.* +recursive-include cmake *.* +recursive-include torch *.* +recursive-include tools *.* +recursive-include test *.* +recursive-include docs *.* +recursive-include ios *.* +recursive-include third_party * +recursive-include test *.* +recursive-include benchmarks *.* +recursive-include scripts *.* +recursive-include mypy_plugins *.* +recursive-include modules *.* +recursive-include functorch *.* +prune */__pycache__ +global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/Makefile b/Makefile index 3db2b7aa44e76..3dc907d125f8f 100644 --- a/Makefile +++ b/Makefile @@ -57,8 +57,12 @@ setup-env-cuda: setup-env-rocm: $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm" +<<<<<<< HEAD .PHONY: setup-lint setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml +======= +.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) @echo "Setting up lintrunner..." $(PIP) install lintrunner lintrunner init @@ -66,6 +70,12 @@ setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrun @mkdir -p .lintbin @sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256 +<<<<<<< HEAD +======= +.PHONY: setup-lint +setup-lint: .lintbin/.lintrunner.sha256 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: lazy-setup-lint lazy-setup-lint: .lintbin/.lintrunner.sha256 @if [ ! -x "$(shell command -v lintrunner)" ]; then \ diff --git a/README.md b/README.md index a0c9b54c95a8b..abbaa4e36e1b3 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,11 @@ Elaborating Further: If you use NumPy, then you have used Tensors (a.k.a. ndarray). +<<<<<<< HEAD ![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png) +======= +![Tensor illustration](./docs/source/_static/img/tensor_illustration.png) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the computation by a huge amount. 
@@ -161,7 +165,11 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: +<<<<<<< HEAD - Python 3.10 or later +======= +- Python 3.9 or later +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) @@ -200,7 +208,11 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA +<<<<<<< HEAD Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver, and NVIDIA hardware. +======= +Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. If @@ -228,7 +240,10 @@ If you want to disable Intel GPU support, export the environment variable `USE_X Other potentially useful environment variables may be found in `setup.py`. #### Get the PyTorch Source +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash git clone https://github.com/pytorch/pytorch cd pytorch @@ -242,8 +257,14 @@ git submodule update --init --recursive **Common** ```bash +<<<<<<< HEAD # Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above pip install --group dev +======= +conda install cmake ninja +# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below +pip install -r requirements.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On Linux** @@ -275,6 +296,7 @@ conda install pkg-config libuv pip install mkl-static mkl-include # Add these packages if torch.distributed is needed. # Distributed package support on Windows is a prototype feature and is subject to changes. +<<<<<<< HEAD conda install -c conda-forge libuv=1.51 ``` @@ -284,22 +306,41 @@ conda install -c conda-forge libuv=1.51 If you're compiling for AMD ROCm then first run this command: +======= +conda install -c conda-forge libuv=1.39 +``` + +#### Install PyTorch +**On Linux** + +If you're compiling for AMD ROCm then first run this command: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash # Only run this if you're compiling for ROCm python tools/amd_build/build_amd.py ``` Install PyTorch +<<<<<<< HEAD ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" python -m pip install --no-build-isolation -v -e . 
+======= +```bash +export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On macOS** ```bash +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python3 setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **On Windows** @@ -311,7 +352,11 @@ If you want to build legacy python code, please refer to [Building on legacy cod In this mode PyTorch computations will run on your CPU, not your GPU. ```cmd +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/main/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used. @@ -332,6 +377,10 @@ Additional libraries such as You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/main/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```cmd cmd @@ -351,7 +400,12 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\ :: [Optional] If you want to override the CUDA host compiler set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` **Intel GPU builds** @@ -373,7 +427,11 @@ if defined CMAKE_PREFIX_PATH ( set "CMAKE_PREFIX_PATH=%CONDA_PREFIX%\Library" ) +<<<<<<< HEAD python -m pip install --no-build-isolation -v -e . +======= +python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` ##### Adjust Build Options (Optional) @@ -383,7 +441,10 @@ the following. For example, adjusting the pre-detected directories for CuDNN or with such a step. 
On Linux +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" CMAKE_ONLY=1 python setup.py build @@ -391,10 +452,16 @@ ccmake build # or cmake-gui build ``` On macOS +<<<<<<< HEAD ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build +======= +```bash +export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` @@ -517,7 +584,11 @@ on [our website](https://pytorch.org/get-started/previous-versions). ## Getting Started +<<<<<<< HEAD Three pointers to get you started: +======= +Three-pointers to get you started: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Tutorials: get you started with understanding and using PyTorch](https://pytorch.org/tutorials/) - [Examples: easy to understand PyTorch code across all domains](https://github.com/pytorch/examples) - [The API Reference](https://pytorch.org/docs/) @@ -559,7 +630,11 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. +<<<<<<< HEAD PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +======= +PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. diff --git a/RELEASE.md b/RELEASE.md index 87f042d659fdf..553896d87d9d5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,7 +3,10 @@ - [Release Compatibility Matrix](#release-compatibility-matrix) +<<<<<<< HEAD - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Release Cadence](#release-cadence) - [General Overview](#general-overview) - [Frequently Asked Questions](#frequently-asked-questions) @@ -51,8 +54,11 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm | | --- | --- | --- | --- | --- | --- | +<<<<<<< HEAD | 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 | | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 | +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 | | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 | | 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 | @@ -64,6 +70,7 @@ Following is the Release Compatibility Matrix for PyTorch releases: | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 | | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 | +<<<<<<< HEAD ### PyTorch CUDA Support Matrix For Release 2.9 PyTorch Supports following CUDA Architectures: @@ -80,6 +87,8 @@ For Release 2.9 PyTorch Supports following CUDA Architectures: | 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) | | 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) | +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Release Cadence Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. @@ -92,9 +101,15 @@ Following is the release cadence. All future dates below are tentative. For late | 2.4 | Jun 2024 | Jul 2024 | Sept 2024 | Not planned | | 2.5 | Sep 2024 | Oct 2024 | Nov 2024 | Not planned | | 2.6 | Dec 2024 | Jan 2025 | Not planned | Not planned | +<<<<<<< HEAD | 2.7 | Mar 2025 | Apr 2025 | Jun 2025 | Not planned | | 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) | | 2.9 | Sept 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) | +======= +| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) | +| 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) | +| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | 2.10 | Dec 2025 | Jan 2026 | (Feb 2026) | (Mar 2026) | | 2.11 | Mar 2026 | Apr 2026 | (Jun 2026) | (Jul 2026) | diff --git a/SECURITY.md b/SECURITY.md index ed8228af36724..b381b749d58fc 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -16,8 +16,11 @@ However, if you believe you have found a security vulnerability in PyTorch, we e Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new +<<<<<<< HEAD All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat @@ -31,9 +34,15 @@ Be careful when running untrusted models. This classification includes models cr **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). +<<<<<<< HEAD **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. +======= +**Be mindful of risky model formats**. 
Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done. + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. diff --git a/android/README.md b/android/README.md index f0c74750522de..102a795fed980 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,11 @@ ## Demo applications and tutorials +<<<<<<< HEAD Please refer to [meta-pytorch/executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +======= +Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h index 03b00cc215640..b7394a370175a 100644 --- a/aten/src/ATen/BlasBackend.h +++ b/aten/src/ATen/BlasBackend.h @@ -28,6 +28,7 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { return stream << BlasBackendToString(backend); } +<<<<<<< HEAD namespace blas { enum class ScalingType : std::uint8_t { @@ -43,4 +44,6 @@ enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 }; } // namespace blas +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index b9ccee7db811f..520de1082b974 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -38,7 +38,11 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT) configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") # TODO: Do not generate CUDAConfig.h for ROCm BUILDS +<<<<<<< HEAD # At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds +======= +# At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_CUDA OR USE_ROCM) configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h") endif() @@ -96,8 +100,11 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") file(GLOB vulkan_cpp "vulkan/*.cpp") file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp") +<<<<<<< HEAD file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp") +======= 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Metal file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") @@ -121,8 +128,11 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") file(GLOB_RECURSE native_mps_metal "native/mps/*.metal") file(GLOB_RECURSE native_mps_h "native/mps/*.h") +<<<<<<< HEAD file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm") file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp @@ -182,6 +192,7 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") # if USE_FLASH_ATTENTION is set, ensure CK instances get generated if(USE_FLASH_ATTENTION) +<<<<<<< HEAD if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1") message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead") caffe2_update_option(USE_ROCM_CK_SDPA ON) @@ -203,6 +214,24 @@ if(USE_FLASH_ATTENTION) add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) +======= + if(DEFINED ENV{USE_CK_FLASH_ATTENTION}) + set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION}) + if(USE_CK_FLASH_ATTENTION STREQUAL "1") + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) + if(NUM_ARCHS GREATER 1) + message(WARNING "Building CK for multiple archs can increase build time considerably! 
+ Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") + endif() + endif() + message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled") + message(STATUS "Generating CK kernel instances...") + add_subdirectory(native/transformers/hip/flash_attn/ck) + file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip") file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip") @@ -216,7 +245,11 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) +<<<<<<< HEAD target_include_directories(flash_attention SYSTEM PUBLIC +======= + target_include_directories(flash_attention PUBLIC +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -252,6 +285,7 @@ if(USE_MEM_EFF_ATTENTION) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu}) endif() +<<<<<<< HEAD IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") set(USE_FBGEMM_GENAI off) @@ -363,6 +397,8 @@ IF(USE_FBGEMM_GENAI) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XNNPACK file(GLOB native_xnnpack "native/xnnpack/*.cpp") @@ -410,9 +446,12 @@ if(USE_VULKAN) else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +<<<<<<< HEAD if(USE_EIGEN_SPARSE) set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp}) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_MTIA) set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) @@ -491,6 +530,7 @@ if(USE_CUDA) endif() if(USE_ROCM) +<<<<<<< HEAD if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM) # NOTE: The PyTorch build does not actually add_subdirectory # third_party/composable_kernel or use it as a CMake library. What is used @@ -520,13 +560,46 @@ if(USE_ROCM) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) _pytorch_rocm_generate_ck_conf() endif() +======= + # NOTE: The PyTorch build does not actually add_subdirectory + # third_party/composable_kernel or use it as a CMake library. What is used + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. 
+ function(_pytorch_rocm_generate_ck_conf) + set(CK_ENABLE_INT8 "ON") + set(CK_ENABLE_FP16 "ON") + set(CK_ENABLE_FP32 "ON") + set(CK_ENABLE_FP64 "ON") + set(CK_ENABLE_BF16 "ON") + set(CK_ENABLE_FP8 "ON") + set(CK_ENABLE_BF8 "ON") + set(CK_USE_XDL "ON") + set(CK_USE_WMMA "ON") + configure_file( + "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" + ) + endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) + _pytorch_rocm_generate_ck_conf() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Next two lines are needed because TunableOp uses third-party/fmt list(APPEND ATen_HIP_INCLUDE $) list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only) +<<<<<<< HEAD if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) endif() +======= +if(USE_FLASH_ATTENTION) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) +endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} @@ -536,13 +609,20 @@ if(USE_ROCM) ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) +<<<<<<< HEAD if(NOT USE_ROCM_CK_GEMM) +======= + if(WIN32) # Windows doesn't support Composable Kernels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck}) endif() +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp ${native_nested_hip_cpp} @@ -581,7 +661,11 @@ if(LAPACK_FOUND) # would not need this at all), some of our libraries (magma in particular) # backend to CPU BLAS/LAPACK implementations, and so it is very important # we get the *right* implementation, because even if the symbols are the +<<<<<<< HEAD # same, LAPACK implementations may have different calling conventions. +======= + # same, LAPACK implementions may have different calling conventions. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # This caused https://github.com/pytorch/pytorch/issues/7353 # # We do NOT do this on Linux, since we just rely on torch_cpu to @@ -621,11 +705,14 @@ if(UNIX) if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1) endif(HAVE_MALLOC_USABLE_SIZE) +<<<<<<< HEAD set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h") CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) if(HAVE_POSIX_FALLOCATE) add_definitions(-DHAVE_POSIX_FALLOCATE=1) endif(HAVE_POSIX_FALLOCATE) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif(UNIX) ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC) @@ -707,6 +794,7 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) +<<<<<<< HEAD if($ENV{ATEN_STATIC_CUDA}) if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) @@ -726,6 +814,26 @@ if(USE_CUDA AND NOT USE_ROCM) CUDA::cusolver_static ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static ) +======= + if($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback + ) + if(NOT BUILD_LAZY_CUDA_LINALG) + if(CUDA_VERSION_MAJOR LESS_EQUAL 11) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static + ) + elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + CUDA::cusolver_static + ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static + ) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() else() list(APPEND ATen_CUDA_DEPENDENCY_LIBS @@ -790,6 +898,7 @@ endif() if(USE_MPS) include(../../../cmake/Metal.cmake) +<<<<<<< HEAD set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm}) if(CAN_COMPILE_METAL) @@ -809,6 +918,31 @@ if(USE_MPS) else() file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) +======= + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) + + if(CAN_COMPILE_METAL) + foreach(SHADER ${native_mps_metal}) + cmake_path(GET SHADER STEM TGT_STEM) + string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air") + string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air") + list(APPEND AIR_BASIC ${TGT_BASIC}) + list(APPEND AIR_BFLOAT ${TGT_BFLOAT}) + metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0") + metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1") + endforeach() + air_to_metallib(kernels_basic.metallib ${AIR_BASIC}) + air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT}) + add_custom_command( + COMMAND echo "// $$(date)" > metallib_dummy.cpp + DEPENDS kernels_basic.metallib kernels_bfloat.metallib + OUTPUT metallib_dummy.cpp + COMMENT "Updating metallibs timestamp") + add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp) + else() + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") + 
foreach(SHADER ${native_mps_metal}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h") metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME}) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 8b283556c7a43..4e3b090f7bfdd 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -144,7 +144,12 @@ inline std::string _all_equal_numel_error(at::ArrayRef tensors) { inline bool _apply_preamble(ArrayRef tensors) { checkDeviceType("CPU_tensor_apply", tensors, kCPU); checkLayout("CPU_tensor_apply", tensors, kStrided); +<<<<<<< HEAD TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors)); +======= + if (!_all_equal_numel(tensors)) + TORCH_CHECK(false, _all_equal_numel_error(tensors)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // An empty tensor has no elements for (auto& t : tensors) if (t.numel() == 0) diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 44ad24b81755f..f27391e7ee73c 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -131,18 +131,36 @@ uint64_t CPUGeneratorImpl::seed() { /** * Sets the internal state of CPUGeneratorImpl. The new internal state +<<<<<<< HEAD * must be a strided CPU byte tensor and of the same size as CPUGeneratorImplState. +======= + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { using detail::CPUGeneratorImplState; using detail::CPUGeneratorImplStateLegacy; +<<<<<<< HEAD static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); constexpr size_t size = sizeof(CPUGeneratorImplState); +======= + static_assert(std::is_standard_layout_v, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) detail::check_rng_state(new_state); at::mt19937 engine; +<<<<<<< HEAD auto new_state_size = new_state.numel(); TORCH_CHECK(new_state_size == size, "Expected a CPUGeneratorImplState of size ", size, @@ -150,6 +168,51 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto rng_state = new_state.data_ptr_impl(); auto legacy_pod = &(rng_state->legacy_pod); +======= + auto float_normal_sample = std::optional(); + auto double_normal_sample = std::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. 
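The legacy branch that follows rebuilds the cached normal sample from the legacy fields `normal_x` and `normal_rho`; a minimal Python sketch of that conversion, mirroring the `r * sin(theta)` expression in the hunk (illustrative, not part of the patch):

```python
import math

def cached_normal_from_legacy(normal_x: float, normal_rho: float) -> float:
    # The legacy state keeps the uniform draw (normal_x) and the radius
    # (normal_rho); the new state caches the second Box-Muller sample
    # directly, i.e. the sin() branch of the transform.
    theta = 2.0 * math.pi * normal_x
    return normal_rho * math.sin(theta)
```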
+ CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the std::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. + if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * c10::pi * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = std::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = std::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = std::optional(legacy_pod->normal_y); + } + } else { + TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // construct engine_ // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are @@ -163,12 +226,17 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { engine.set_data(rng_data); TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); this->engine_ = engine; +<<<<<<< HEAD this->next_float_normal_sample_ = rng_state->is_next_float_normal_sample_valid ? std::optional(rng_state->next_float_normal_sample) : std::optional(); this->next_double_normal_sample_ = legacy_pod->normal_is_valid ? 
std::optional(legacy_pod->normal_y) : std::optional(); +======= + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in index 0bae6d4af6e5e..c4475dc390fce 100644 --- a/aten/src/ATen/Config.h.in +++ b/aten/src/ATen/Config.h.in @@ -20,4 +20,7 @@ #define AT_BLAS_F2C() @AT_BLAS_F2C@ #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@ #define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@ +<<<<<<< HEAD #define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 3310abfb41d54..e91a2abb8a710 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -14,13 +14,18 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include C10_DIAGNOSTIC_POP() +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // USE_FBGEMM #if defined(__aarch64__) && !defined(C10_MOBILE) #include #endif +<<<<<<< HEAD namespace at { namespace { @@ -99,6 +104,11 @@ std::string precision2str(Float32Precision prec) { TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast(prec), ")"); } +======= + +namespace at { + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Context::Context() = default; // TODO: This could be bad juju if someone calls globalContext() in the @@ -192,6 +202,7 @@ void Context::setUserEnabledNNPACK(bool e) { enabled_nnpack = e; } +<<<<<<< HEAD bool Context::allowTF32CuDNN(std::optional op) const { if (!op.has_value()) { bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32; @@ -207,14 +218,21 @@ bool Context::allowTF32CuDNN(std::optional op) const { return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } warn_deprecated_fp32_precision_api(); +======= +bool Context::allowTF32CuDNN() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return allow_tf32_cudnn; } void Context::setAllowTF32CuDNN(bool b) { +<<<<<<< HEAD setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; warn_deprecated_fp32_precision_api(); +======= + allow_tf32_cudnn = b; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setSDPPriorityOrder(const std::vector& order) { @@ -235,6 +253,7 @@ bool Context::allowTF32OneDNN() const { return allow_tf32_onednn; } +<<<<<<< HEAD // NOLINTNEXTLINE(clang-diagnostic-unused-parameter) void Context::setAllowTF32OneDNN(bool b){ #ifdef USE_XPU @@ -242,6 +261,14 @@ bool Context::allowTF32OneDNN() const { #else TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. 
The current Torch version does not have Intel GPU Support."); #endif +======= +void Context::setAllowTF32OneDNN(bool b){ +#ifdef USE_XPU + allow_tf32_onednn = b; +#else + TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support."); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool Context::userEnabledFlashSDP() const { @@ -292,6 +319,48 @@ bool Context::userEnabledOverrideableSDP() const { return enabled_overrideable; } +<<<<<<< HEAD +======= +static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; +static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; +#ifdef USE_ROCM +static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; +#endif + +bool Context::checkCuBLASConfigDeterministic() { + // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config + // is set to deterministic setting + if (hasCUDART()) { + const auto workspace_config = c10::utils::get_env(cublas_config_var_name); + return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]); + } + return true; +} + +void Context::alertCuBLASConfigNotDeterministic() const { + static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic(); + if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) { + return; + } + + auto msg = c10::str( + "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", + "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", + "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ", + "case, you must set an environment variable before running your PyTorch application: ", + cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", + cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ", + "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" + ); + + if (deterministicAlgorithmsWarnOnly()) { + TORCH_WARN(msg); + } else { + TORCH_CHECK(false, msg); + } +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool Context::benchmarkCuDNN() const { return benchmark_cudnn; } @@ -308,6 +377,7 @@ void Context::setBenchmarkLimitCuDNN(int b) { benchmark_limit_cudnn = b; } +<<<<<<< HEAD bool Context::immediateMiopen() const { return immediate_miopen; } @@ -373,10 +443,41 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) return Float32Precision::NONE; } return precision; +======= +bool Context::allowTF32CuBLAS() const { +#ifdef USE_ROCM + const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); + if (allow_tf32 != true) { + return false; + } +#endif + return float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; +} + +void Context::setAllowTF32CuBLAS(bool b) { +#ifdef USE_ROCM + const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); + if (allow_tf32 != true) { + C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " + << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; + return; + } +#endif + float32_matmul_precision = b ? 
at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; +} + +Float32MatmulPrecision Context::float32MatmulPrecision() const { + return float32_matmul_precision; +} + +void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) { + float32_matmul_precision = p; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setFloat32MatmulPrecision(const std::string &s) { auto match = [this](const std::string & s_) { +<<<<<<< HEAD warn_deprecated_fp32_precision_api(); // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { @@ -393,6 +494,17 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16); +======= + // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention + if (s_ == "highest") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; + return true; + } else if (s_ == "high") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGH; + return true; + } else if (s_ == "medium") { + float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } return false; @@ -406,6 +518,7 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { "setFloat32MatmulPrecision call has no effect."); } +<<<<<<< HEAD void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) { auto it = fp32_precision.find(std::make_pair(backend, op)); TORCH_CHECK( @@ -418,6 +531,8 @@ void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32P it->second = p; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::LinalgBackend Context::linalgPreferredBackend() const { return linalg_preferred_backend; } @@ -442,9 +557,12 @@ at::BlasBackend Context::blasPreferredBackend() { // call site for blasPreferredBackend(), we set it to an actual value. 
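The `setFloat32MatmulPrecision(const std::string&)` matcher above is what the user-facing precision knob feeds into; a short usage sketch with the same three accepted values (illustrative, not part of the patch):

```python
import torch

# "highest" keeps full-FP32 matmuls; "high" and "medium" permit
# lower-precision internal math such as TF32 on CUDA, matching the
# string branches matched above.
torch.set_float32_matmul_precision("high")
print(torch.get_float32_matmul_precision())  # -> "high"
```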
if (blas_preferred_backend == at::BlasBackend::Default) { blas_preferred_backend = at::BlasBackend::Cublas; +<<<<<<< HEAD // This logic sits in the getter because it needs to validate // values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT // which initialize the backend without calling the setter +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_ROCM // AMD Instinct targets prefer hipblaslt static const bool hipblaslt_preferred = []() { @@ -453,6 +571,12 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60400 "gfx1200", "gfx1201", #endif +<<<<<<< HEAD +======= +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif @@ -474,6 +598,7 @@ at::BlasBackend Context::blasPreferredBackend() { // hipblaslt support for all archs is not as complete as hipblas if (blas_preferred_backend == at::BlasBackend::Cublaslt) { static const bool hipblaslt_unsupported = []() { +<<<<<<< HEAD if(!hasCuBLASLt()) { return true; @@ -485,6 +610,18 @@ at::BlasBackend Context::blasPreferredBackend() { #endif #if ROCM_VERSION >= 70000 "gfx950", "gfx1150", "gfx1151" +======= + static const std::vector archs = { + "gfx90a", "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1100", "gfx1101", "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { @@ -503,6 +640,7 @@ at::BlasBackend Context::blasPreferredBackend() { return blas_preferred_backend; } +<<<<<<< HEAD bool Context::ckSupported() { #ifdef USE_ROCM static const std::vector supported_archs = { @@ -521,6 +659,8 @@ bool Context::ckSupported() { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void Context::setBlasPreferredBackend(at::BlasBackend b) { #ifdef _MSC_VER TORCH_WARN_ONCE( @@ -530,6 +670,7 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #else TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); +<<<<<<< HEAD #ifdef USE_ROCM static const bool ckSupportedFlag = ckSupported(); static const bool hasCKGEMMFlag = hasCKGEMM(); @@ -538,6 +679,10 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { "architecture supported for CK: ", ckSupportedFlag, ", PyTorch built with CK GEMM support: ", hasCKGEMMFlag); #endif +======= + TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(), + "Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) { TORCH_WARN_ONCE( "torch.backends.cuda.preferred_blas_library is an experimental feature. " @@ -549,6 +694,7 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #endif } +<<<<<<< HEAD at::ROCmFABackend Context::getROCmFAPreferredBackend() { #ifdef USE_ROCM // Set potential "Default" value so we don't have to interpret at call sites. 
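The `setBlasPreferredBackend()` path above is reached through `torch.backends.cuda.preferred_blas_library`, the experimental knob named in its own warning message; a usage sketch (illustrative, not part of the patch; availability of cuBLASLt/hipBLASLt and CK remains gated by the build flags and architecture checks shown in both conflict sides):

```python
import torch

current = torch.backends.cuda.preferred_blas_library()   # query the current choice
torch.backends.cuda.preferred_blas_library("cublaslt")   # opt in to cuBLASLt/hipBLASLt
```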
@@ -572,10 +718,14 @@ at::ROCmFABackend Context::getROCmFAPreferredBackend() { } #endif +======= +at::ROCmFABackend Context::getROCmFAPreferredBackend() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return rocm_fa_preferred_backend; } void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { +<<<<<<< HEAD #ifdef USE_ROCM static const bool hasCKSDPAFlag = hasCKSDPA(); static const bool ckSupportedFlag = ckSupported(); @@ -583,10 +733,37 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { "Cannot set preferred SDPA backend to CK since following conditions are not true: ", "architecture supported for CK: ", ckSupportedFlag, ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); +======= + + // TODO: add plumbing for hasCK for validity checking + TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(), + "Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm."); +#ifdef USE_ROCM + if(b == at::ROCmFABackend::Ck) { + static const bool ck_unsupported = []() { + static const std::vector archs = { + "gfx90a", "gfx942", "gfx950" + }; + for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { + TORCH_WARN_ONCE( + "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); + return true; + } + } + return false; + }(); + if(!ck_unsupported) rocm_fa_preferred_backend = b; + } + else { + rocm_fa_preferred_backend = b; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif rocm_fa_preferred_backend = b; } +<<<<<<< HEAD CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } @@ -614,6 +791,22 @@ CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const { void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); +======= +bool Context::allowFP16ReductionCuBLAS() const { + return allow_fp16_reduction_cublas; +} + +void Context::setAllowFP16ReductionCuBLAS(bool b) { + allow_fp16_reduction_cublas = b; +} + +bool Context::allowBF16ReductionCuBLAS() const { + return allow_bf16_reduction_cublas; +} + +void Context::setAllowBF16ReductionCuBLAS(bool b) { + allow_bf16_reduction_cublas = b; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool Context::allowFP16AccumulationCuBLAS() const { @@ -673,6 +866,7 @@ bool Context::hasLAPACK() { #endif } +<<<<<<< HEAD bool Context::hasEigenSparse() { #if AT_USE_EIGEN_SPARSE() return true; @@ -681,6 +875,8 @@ bool Context::hasEigenSparse() { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::QEngine Context::qEngine() const { static auto _quantized_engine = []() { at::QEngine qengine = at::kNoQEngine; @@ -704,14 +900,22 @@ at::QEngine Context::qEngine() const { #endif return qengine; }(); +<<<<<<< HEAD auto qt_engine = quantized_engine.load(); return qt_engine == at::QEngine::NoQEngine ? 
_quantized_engine : qt_engine; +======= + return quantized_engine.value_or(_quantized_engine); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void Context::setQEngine(at::QEngine e) { const auto& qengines = supportedQEngines(); if (std::find(qengines.begin(), qengines.end(), e) != qengines.end()) { +<<<<<<< HEAD quantized_engine.store(e); +======= + quantized_engine = e; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } TORCH_CHECK(false, "quantized engine ", toString(e), " is not supported"); @@ -723,9 +927,23 @@ const std::vector& Context::supportedQEngines() { // Engines are listed in priority order: later one wins // By default we prefer FBGEMM if we're running on server side // QNNPACK on server side has some issue, so we disable it by default. +<<<<<<< HEAD +#ifdef USE_PYTORCH_QNNPACK + engines.push_back(at::kQNNPACK); +#endif +======= +#ifdef C10_MOBILE + engines.push_back(at::kNoQEngine); +#ifdef USE_PYTORCH_QNNPACK + engines.push_back(at::kQNNPACK); +#endif +#else // C10_MOBILE #ifdef USE_PYTORCH_QNNPACK engines.push_back(at::kQNNPACK); #endif + engines.push_back(at::kNoQEngine); +#endif // C10_MOBILE +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_MKLDNN_ENABLED() engines.push_back(at::kONEDNN); @@ -857,7 +1075,10 @@ void Context::setAllowFP16ReductionCPU(bool b) { #if defined(__aarch64__) && !defined(C10_MOBILE) if (!cpuinfo_initialize() || !cpuinfo_has_arm_fp16_arith()) #else +<<<<<<< HEAD // NOLINTNEXTLINE(facebook-hte-MissingBraces) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (true) #endif TORCH_CHECK(false, "Float16 arithmetic is not supported by the CPU!"); diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index a4a26b5671e59..5854e827a2572 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -19,13 +19,17 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include #include #include +<<<<<<< HEAD #include #include @@ -33,6 +37,12 @@ #include #include #include +======= +#include + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { @@ -40,6 +50,7 @@ class Tensor; enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; +<<<<<<< HEAD enum class CuBLASReductionOption : uint8_t { AllowReducedPrecisionWithSplitK = 0, DisallowReducedPrecisionAllowSplitK = 1, @@ -54,6 +65,8 @@ TORCH_API Float32Op str2op(const std::string& name); TORCH_API Float32Precision str2precision(const std::string& name); TORCH_API std::string precision2str(Float32Precision prec); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_API Context { public: Context(); @@ -89,8 +102,11 @@ class TORCH_API Context { return at::detail::getHIPHooks(); } else if (opt_device_type == at::kHPU) { return at::detail::getHPUHooks(); +<<<<<<< HEAD } else if (opt_device_type == at::kXLA) { return at::detail::getXLAHooks(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { TORCH_CHECK( false, @@ -151,8 +167,11 @@ class TORCH_API Context { static bool hasKleidiAI(); static bool hasLAPACK(); static bool hasMKLDNN(); +<<<<<<< HEAD static bool ckSupported(); static bool hasEigenSparse(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool hasMAGMA() { return detail::getCUDAHooks().hasMAGMA(); } @@ -183,12 +202,15 @@ class TORCH_API Context { static bool hasROCM() { return detail::getCUDAHooks().hasROCM(); } +<<<<<<< HEAD static bool hasCKSDPA() { return detail::getCUDAHooks().hasCKSDPA(); } static bool hasCKGEMM() { return detail::getCUDAHooks().hasCKGEMM(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -199,7 +221,11 @@ class TORCH_API Context { return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU); } static bool hasXLA() { +<<<<<<< HEAD return detail::getXLAHooks().hasXLA(); +======= + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static bool hasXPU() { return detail::getXPUHooks().hasXPU(); @@ -229,6 +255,7 @@ class TORCH_API Context { bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); bool benchmarkCuDNN() const; +<<<<<<< HEAD void setBenchmarkCuDNN(bool /*b*/); int benchmarkLimitCuDNN() const; void setBenchmarkLimitCuDNN(int /*b*/); @@ -238,6 +265,15 @@ class TORCH_API Context { void setDeterministicCuDNN(bool /*b*/); bool deterministicMkldnn() const; void setDeterministicMkldnn(bool /*b*/); +======= + void setBenchmarkCuDNN(bool); + int benchmarkLimitCuDNN() const; + void setBenchmarkLimitCuDNN(int); + bool deterministicCuDNN() const; + void setDeterministicCuDNN(bool); + bool deterministicMkldnn() const; + void setDeterministicMkldnn(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool userEnabledNNPACK() const; void setUserEnabledNNPACK(bool e); @@ -255,6 +291,7 @@ class TORCH_API Context { void setSDPPriorityOrder(const std::vector& order); std::array sDPPriorityOrder(); +<<<<<<< HEAD void setSDPUseFlash(bool /*e*/); bool userEnabledFlashSDP() const; @@ -281,6 +318,34 @@ class TORCH_API Context { at::ROCmFABackend getROCmFAPreferredBackend(); void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/); +======= + void setSDPUseFlash(bool); + bool userEnabledFlashSDP() const; + + void setSDPUseMemEfficient(bool); + bool userEnabledMemEfficientSDP() const; + + void setSDPUseMath(bool); + bool userEnabledMathSDP() const; + + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + + void setAllowFP16BF16ReductionMathSDP(bool); + bool allowFP16BF16ReductionMathSDP() const; + + void setSDPUseOverrideable(bool); + bool userEnabledOverrideableSDP() const; + + at::LinalgBackend linalgPreferredBackend() const; + void setLinalgPreferredBackend(at::LinalgBackend); + + at::BlasBackend blasPreferredBackend(); + void setBlasPreferredBackend(at::BlasBackend); + + at::ROCmFABackend getROCmFAPreferredBackend() const; + void setROCmFAPreferredBackend(at::ROCmFABackend); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -313,9 +378,15 @@ class TORCH_API Context { bool deterministicAlgorithms() const; bool deterministicAlgorithmsWarnOnly() const; +<<<<<<< HEAD void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/); bool deterministicFillUninitializedMemory() const; void setDeterministicFillUninitializedMemory(bool /*b*/); +======= + void setDeterministicAlgorithms(bool, bool); + bool deterministicFillUninitializedMemory() const; + void setDeterministicFillUninitializedMemory(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Writing Nondeterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -329,7 +400,17 @@ class TORCH_API Context { // // * Throw an error when `Context::deterministicAlgorithms()` is true. Most // of the time, this should be accomplished by calling +<<<<<<< HEAD // `at::globalContext().alertNotDeterminstic(). +======= + // `at::globalContext().alertNotDeterminstic()`. However, if the + // nondeterministic behavior is caused by the CuBLAS workspace + // configuration in CUDA >= 10.2, + // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be + // called instead (in this case, a comment explaining why the operation is + // nondeterministic is not necessary). See below for details on these + // methods. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // * Have an entry in the list of nondeterministic PyTorch operations in the // docstring of `use_deterministic_algorithms()` in torch/__init__.py @@ -353,6 +434,7 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true static void alertNotDeterministic(std::string_view const& caller); +<<<<<<< HEAD void setFloat32MatmulPrecision(const std::string& s); void setFloat32Precision( Float32Backend backend, @@ -376,6 +458,29 @@ class TORCH_API Context { bool allow_splitk = true); bool allowFP16AccumulationCuBLAS() const; void setAllowFP16AccumulationCuBLAS(bool /*b*/); +======= + // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA + // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or + // ":4096:8". 
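The requirement described in this comment block is user-visible; a minimal sketch of how it is typically satisfied from Python (illustrative, not part of the patch; the environment variable must be set before cuBLAS is initialized):

```python
import os
import torch

# One of the two deterministic cuBLAS workspace configurations named above.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"
torch.use_deterministic_algorithms(True)
```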
For more details: + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility + void alertCuBLASConfigNotDeterministic() const; + + void setFloat32MatmulPrecision(const std::string& s); + bool allowTF32CuDNN() const; + void setAllowTF32CuDNN(bool); + bool allowTF32OneDNN() const; + void setAllowTF32OneDNN(bool); + bool allowTF32CuBLAS() const; + void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); + bool allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS(bool); + bool allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS(bool); + bool allowFP16AccumulationCuBLAS() const; + void setAllowFP16AccumulationCuBLAS(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Matmuls can use a so-called "persistent" kernel which launches one CUDA // block for each SM on the GPU, and each block then iterates over multiple @@ -387,7 +492,11 @@ class TORCH_API Context { // to make matmuls target only a subset of the SMs, so they can fully schedule // even next to a comms kernel, and only be a few percent slower. std::optional _SMCarveout_EXPERIMENTAL() const; +<<<<<<< HEAD void _setSMCarveout_EXPERIMENTAL(std::optional /*c*/); +======= + void _setSMCarveout_EXPERIMENTAL(std::optional); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::QEngine qEngine() const; void setQEngine(at::QEngine e); @@ -408,7 +517,11 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); bool allowFP16ReductionCPU() const; +<<<<<<< HEAD void setAllowFP16ReductionCPU(bool /*b*/); +======= + void setAllowFP16ReductionCPU(bool); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Preserved for BC void lazyInitCUDA() { @@ -438,6 +551,10 @@ class TORCH_API Context { } private: +<<<<<<< HEAD +======= + static bool checkCuBLASConfigDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::array init_; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -449,8 +566,12 @@ class TORCH_API Context { at::SDPBackend::flash_attention, at::SDPBackend::efficient_attention, at::SDPBackend::math, +<<<<<<< HEAD at::SDPBackend::cudnn_attention, at::SDPBackend::overrideable}; +======= + at::SDPBackend::cudnn_attention}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool enabled_flashSDP = true; bool enabled_mem_efficientSDP = true; bool enabled_mathSDP = true; @@ -458,17 +579,25 @@ class TORCH_API Context { bool enabled_overrideable = true; bool allow_fp16_bf16_reduction_mathSDP = false; bool benchmark_cudnn = false; +<<<<<<< HEAD bool immediate_miopen = false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Float32MatmulPrecision float32_matmul_precision = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true ? 
at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; int benchmark_limit_cudnn = 10; bool allow_tf32_cudnn = true; +<<<<<<< HEAD CuBLASReductionOption allow_fp16_reduction_cublas = CuBLASReductionOption::AllowReducedPrecisionWithSplitK; CuBLASReductionOption allow_bf16_reduction_cublas = CuBLASReductionOption::AllowReducedPrecisionWithSplitK; +======= + bool allow_fp16_reduction_cublas = true; + bool allow_bf16_reduction_cublas = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool allow_fp16_accumulation_cublas = false; std::optional sm_carveout = std::nullopt; bool enabled_mkldnn = true; @@ -494,6 +623,7 @@ class TORCH_API Context { bool release_original_weights = false; #endif bool display_vmap_fallback_warnings_ = false; +<<<<<<< HEAD std::atomic quantized_engine = at::QEngine::NoQEngine; bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; @@ -514,6 +644,12 @@ class TORCH_API Context { : Float32Precision::TF32}, }; +======= + std::optional quantized_engine = std::nullopt; + bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Allocator* prev_allocator_ptr_{nullptr}; }; @@ -625,10 +761,13 @@ inline bool hasLAPACK() { return globalContext().hasLAPACK(); } +<<<<<<< HEAD inline bool hasEigenSparse() { return globalContext().hasEigenSparse(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool hasMAGMA() { return globalContext().hasMAGMA(); } @@ -693,4 +832,8 @@ struct TORCH_API ROCmBackwardPassGuard { ~ROCmBackwardPassGuard(); static bool is_backward_pass(); }; +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index ccb0ae15a11e6..1bbf2285f39f7 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -65,6 +65,7 @@ DLDataType getDLDataType(const Tensor& t) { break; // TODO(#146647): use macro here instead of spelling out each shell dtype case ScalarType::Float8_e5m2: +<<<<<<< HEAD dtype.code = DLDataTypeCode::kDLFloat8_e5m2; break; case ScalarType::Float8_e5m2fnuz: @@ -83,29 +84,53 @@ DLDataType getDLDataType(const Tensor& t) { dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; dtype.lanes = 2; dtype.bits = 4; +======= + case ScalarType::Float8_e5m2fnuz: + case ScalarType::Float8_e4m3fn: + case ScalarType::Float8_e4m3fnuz: + case ScalarType::Float8_e8m0fnu: + TORCH_CHECK(false, "float8 types are not supported by dlpack"); + break; + case ScalarType::Float4_e2m1fn_x2: + TORCH_CHECK(false, "float4 types are not supported by dlpack"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) break; case ScalarType::QInt8: case ScalarType::QUInt8: case ScalarType::QInt32: case ScalarType::QUInt4x2: case ScalarType::QUInt2x4: +<<<<<<< HEAD TORCH_CHECK_BUFFER(false, "QUInt/QInt types are not supported by dlpack"); +======= + TORCH_CHECK(false, "QUInt/QInt types are not supported by dlpack"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) break; case ScalarType::Bits1x8: case ScalarType::Bits2x4: case ScalarType::Bits4x2: case ScalarType::Bits8: case ScalarType::Bits16: +<<<<<<< HEAD TORCH_CHECK_BUFFER(false, "Bit types are not supported by dlpack"); break; case ScalarType::Undefined: TORCH_CHECK_BUFFER(false, "Undefined is not a valid ScalarType"); case ScalarType::NumOptions: TORCH_CHECK_BUFFER(false, "NumOptions is not a valid ScalarType"); +======= + TORCH_CHECK(false, "Bit types are not supported by dlpack"); + break; + case ScalarType::Undefined: + TORCH_CHECK(false, "Undefined is not a valid ScalarType"); + case ScalarType::NumOptions: + TORCH_CHECK(false, "NumOptions is not a valid ScalarType"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return dtype; } +<<<<<<< HEAD DLDevice torchDeviceToDLDevice(at::Device device) { DLDevice ctx; @@ -114,6 +139,12 @@ DLDevice torchDeviceToDLDevice(at::Device device) { : 0; switch (device.type()) { +======= +static DLDevice getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) { + DLDevice ctx; + ctx.device_id = static_cast(static_cast(device_id)); + switch (tensor.device().type()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DeviceType::CPU: ctx.device_type = DLDeviceType::kDLCPU; break; @@ -134,7 +165,12 @@ DLDevice torchDeviceToDLDevice(at::Device device) { break; case DeviceType::XPU: ctx.device_type = DLDeviceType::kDLOneAPI; +<<<<<<< HEAD ctx.device_id = at::detail::getXPUHooks().getGlobalIdxFromDevice(device); +======= + ctx.device_id = + at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) break; case DeviceType::MAIA: ctx.device_type = DLDeviceType::kDLMAIA; @@ -142,6 +178,7 @@ DLDevice torchDeviceToDLDevice(at::Device device) { case DeviceType::PrivateUse1: ctx.device_type = DLDeviceType::kDLExtDev; break; +<<<<<<< HEAD case DeviceType::MPS: ctx.device_type = DLDeviceType::kDLMetal; break; @@ -154,11 +191,22 @@ DLDevice torchDeviceToDLDevice(at::Device device) { static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* data = nullptr) { switch (type) { +======= + default: + TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str()); + } + return ctx; +} + +static Device getATenDevice(const DLDevice& ctx, void* data) { + switch (ctx.device_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DLDeviceType::kDLCPU: return at::Device(DeviceType::CPU); #ifndef USE_ROCM // if we are compiled under HIP, we cannot do cuda case DLDeviceType::kDLCUDA: +<<<<<<< HEAD return at::Device(DeviceType::CUDA, index); #endif case DLDeviceType::kDLOpenCL: @@ -182,16 +230,42 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat default: TORCH_CHECK_BUFFER( false, "Unsupported device_type: ", std::to_string(type)); +======= + return at::Device(DeviceType::CUDA, static_cast(ctx.device_id)); +#endif + case DLDeviceType::kDLOpenCL: + return at::Device(DeviceType::OPENCL, static_cast(ctx.device_id)); + case DLDeviceType::kDLROCM: +#ifdef USE_ROCM + // this looks funny, we need to return CUDA here to masquerade + return at::Device(DeviceType::CUDA, static_cast(ctx.device_id)); +#else + return at::Device(DeviceType::HIP, 
static_cast(ctx.device_id)); +#endif + case DLDeviceType::kDLOneAPI: + return at::detail::getXPUHooks().getDeviceFromPtr(data); + case DLDeviceType::kDLMAIA: + return at::Device(DeviceType::MAIA, static_cast(ctx.device_id)); + case DLDeviceType::kDLExtDev: + return at::Device(DeviceType::PrivateUse1, static_cast(ctx.device_id)); + default: + TORCH_CHECK( + false, "Unsupported device_type: ", std::to_string(ctx.device_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } ScalarType toScalarType(const DLDataType& dtype) { ScalarType stype = ScalarType::Undefined; +<<<<<<< HEAD if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { TORCH_CHECK_BUFFER( dtype.lanes == 1, "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); } +======= + TORCH_CHECK(dtype.lanes == 1, "ATen does not support lanes != 1"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (dtype.code) { case DLDataTypeCode::kDLUInt: switch (dtype.bits) { @@ -208,7 +282,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::UInt64; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kUInt bits ", std::to_string(dtype.bits)); } break; @@ -227,7 +305,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Long; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kInt bits ", std::to_string(dtype.bits)); } break; @@ -243,7 +325,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Double; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -253,7 +339,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::BFloat16; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -269,7 +359,11 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::ComplexDouble; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, "Unsupported kFloat bits ", std::to_string(dtype.bits)); } break; @@ -279,6 +373,7 @@ ScalarType toScalarType(const DLDataType& dtype) { stype = ScalarType::Bool; break; default: +<<<<<<< HEAD TORCH_CHECK_BUFFER( false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); } @@ -352,11 +447,20 @@ ScalarType toScalarType(const DLDataType& dtype) { break; default: TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); +======= + TORCH_CHECK( + false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); + } + break; + default: + TORCH_CHECK(false, "Unsupported code ", 
std::to_string(dtype.code)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return stype; } namespace { +<<<<<<< HEAD // The templated classes below are needed for supporting both: // - DLManagedTensor @@ -383,10 +487,21 @@ void fillVersion( tensor->flags = 0; tensor->version.major = DLPACK_MAJOR_VERSION; tensor->version.minor = DLPACK_MINOR_VERSION; +======= +struct ATenDLMTensor { + Tensor handle; + DLManagedTensor tensor{}; +}; +} // namespace + +static void deleter(DLManagedTensor* arg) { + delete static_cast(arg->manager_ctx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // This function returns a shared_ptr to memory managed DLpack tensor // constructed out of ATen tensor +<<<<<<< HEAD template T* toDLPackImpl(const Tensor& src) { ATenDLMTensor* atDLMTensor(new ATenDLMTensor); @@ -429,18 +544,73 @@ at::Tensor fromDLPackImpl(T* src, std::function deleter) { return at::from_blob( dl_tensor.data, IntArrayRef(dl_tensor.shape, dl_tensor.ndim), +======= +DLManagedTensor* toDLPack(const Tensor& src) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } + } + + auto view = src.as_strided(shape, strides, src.storage_offset()); + ATenDLMTensor* atDLMTensor(new ATenDLMTensor); + atDLMTensor->handle = view; + atDLMTensor->tensor.manager_ctx = atDLMTensor; + atDLMTensor->tensor.deleter = &deleter; + atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); + c10::DeviceIndex device_id = 0; + if (src.is_cuda() || src.is_privateuseone()) { + device_id = src.get_device(); + } + atDLMTensor->tensor.dl_tensor.device = getDLDevice(src, device_id); + atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); + atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); + atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); + atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); + atDLMTensor->tensor.dl_tensor.byte_offset = 0; + return &(atDLMTensor->tensor); +} + +Tensor fromDLPack(DLManagedTensor* src) { + auto deleter = [src](void* self [[maybe_unused]]) { + if (src->deleter) { + src->deleter(src); + } + }; + return fromDLPack(src, std::move(deleter)); +} + +Tensor fromDLPack(DLManagedTensor* src, std::function deleter) { + Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data); + ScalarType stype = toScalarType(src->dl_tensor.dtype); + if (!src->dl_tensor.strides) { + return at::from_blob( + src->dl_tensor.data, + IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::move(deleter), at::device(device).dtype(stype), {device}); } return at::from_blob( +<<<<<<< HEAD dl_tensor.data, IntArrayRef(dl_tensor.shape, dl_tensor.ndim), IntArrayRef(dl_tensor.strides, dl_tensor.ndim), +======= + src->dl_tensor.data, + IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), + IntArrayRef(src->dl_tensor.strides, src->dl_tensor.ndim), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) deleter, at::device(device).dtype(stype), {device}); } +<<<<<<< HEAD // Explicitly instantiate the template above for both classes. 
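The toScalarType switch above maps DLPack's (code, bits, lanes) triple onto ATen scalar types, and getDLDataType produces the inverse mapping. A minimal usage sketch, assumed rather than taken from the patch, that exercises the two public helpers declared in DLConvertor.h:

```cpp
// Minimal sketch (assumed usage, not part of the patch): round-trip a dtype
// through the DLPack descriptors handled by the switch above.
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor t = at::zeros({2, 2}, at::kBFloat16);
  // getDLDataType fills code/bits/lanes, e.g. kDLBfloat / 16 / 1 here.
  DLDataType dt = at::getDLDataType(t);
  // toScalarType inverts that mapping; unsupported combinations raise.
  TORCH_CHECK(at::toScalarType(dt) == at::ScalarType::BFloat16);
  return 0;
}
```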
template at::Tensor fromDLPackImpl(DLManagedTensor* src, std::function deleter); @@ -495,4 +665,6 @@ Tensor maybeCopyTensor( return data; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 928731fafb2f6..f28623e744728 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -4,7 +4,11 @@ #include #include +<<<<<<< HEAD // this converter will: +======= +// this convertor will: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1) take a Tensor object and wrap it in the DLPack tensor // 2) take a dlpack tensor and convert it to the ATen Tensor @@ -12,6 +16,7 @@ namespace at { TORCH_API ScalarType toScalarType(const DLDataType& dtype); TORCH_API DLManagedTensor* toDLPack(const Tensor& src); +<<<<<<< HEAD TORCH_API struct DLManagedTensorVersioned* toDLPackVersioned(const Tensor& src); TORCH_API Tensor fromDLPack(DLManagedTensor* src, std::function deleter = {}); @@ -66,4 +71,12 @@ struct DLPackTraits { inline static auto fromDLPack = at::fromDLPackVersioned; }; +======= +TORCH_API Tensor fromDLPack(DLManagedTensor* src); +TORCH_API Tensor +fromDLPack(DLManagedTensor* src, std::function deleter); +TORCH_API DLDataType getDLDataType(const Tensor& t); +TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f23b35047fcc8..0f01d3278929a 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -1,6 +1,9 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -31,7 +34,11 @@ TORCH_API bool isAccelerator(c10::DeviceType device_type); template < typename... T, typename = std::enable_if_t<(std::is_same_v && ...)>> +<<<<<<< HEAD inline bool isAcceleratorExcluded( +======= +TORCH_API inline bool isAcceleratorExcluded( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::DeviceType device_type, c10::DeviceType first_excluded, T... rest_excluded) { @@ -73,6 +80,7 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); // original device index that was active before the change. 
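Both sides of the DLConvertor conflict keep the same external contract: toDLPack wraps a tensor in a DLManagedTensor whose deleter owns a reference to the source storage, and fromDLPack re-imports it as a tensor aliasing the same memory. A hedged sketch of that exporter/importer pair, using only the pre-versioned DLManagedTensor entry points declared in the header above:

```cpp
// Sketch of the toDLPack/fromDLPack pair declared in DLConvertor.h above
// (pre-versioned DLManagedTensor API); illustrative, not part of the patch.
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

int main() {
  at::Tensor src = at::arange(6, at::kFloat).reshape({2, 3});
  // Export: the returned DLManagedTensor keeps src's storage alive via its
  // manager_ctx until the consumer invokes its deleter.
  DLManagedTensor* managed = at::toDLPack(src);
  // Import: fromDLPack builds a tensor that aliases the exported memory and
  // calls managed->deleter once the new tensor is destroyed.
  at::Tensor dst = at::fromDLPack(managed);
  TORCH_CHECK(dst.data_ptr() == src.data_ptr());
  TORCH_CHECK(dst.sizes() == src.sizes());
  return 0;
}
```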
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); +<<<<<<< HEAD TORCH_API inline void emptyCache() { const auto device_type = getAccelerator(true).value(); at::getDeviceAllocator(device_type)->emptyCache(); @@ -94,6 +102,8 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { at::getDeviceAllocator(device_type)->resetPeakStats(device_index); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 0e535ab20cd21..e6faaaf1e7f12 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -31,9 +31,13 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { return at::globalContext().getPinnedMemoryAllocator(opt_device_type); } else { TORCH_CHECK( +<<<<<<< HEAD false, "pin_memory=True requires a CUDA or other accelerator backend; " "no pinned memory allocator is available on this system.") +======= + false, "Need to provide pin_memory allocator to use pin memory.") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index ac76d09537fa2..952a85bced0d6 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -16,8 +16,13 @@ inline void check_size_nonnegative(ArrayRef size) { inline void check_size_nonnegative(ArrayRef size) { for (const auto& x : size) { +<<<<<<< HEAD TORCH_SYM_CHECK( x.sym_ge(0), +======= + TORCH_CHECK( + x.expect_size(__FILE__, __LINE__), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Trying to create tensor with negative dimension ", x, ": ", diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 1bf46ebe61b61..2ab18ed01ab07 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -468,7 +468,11 @@ inline Tensor _sum_to( // if we assume no reduction due to unbacked we ensure that at runtime. TORCH_MAYBE_SYM_CHECK( sym_eq(shape[i - leading_dims], sizes[i]), +<<<<<<< HEAD "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", +======= + "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shape[i - leading_dims], ", ", sizes[i]) diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 123d87b304148..1af022b0411ae 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -233,8 +233,13 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor // NOLINTNEXTLINE(performance-unnecessary-value-param) Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) { +<<<<<<< HEAD // It would be nice if this logic could be reused from autograd's split_backward(), but I don't think it can. 
// For functionalization, we have only have one of the tensors from the TensorList outputted by split(), and we want to layer i +======= + // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. + // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // on top of the base tensor. // For autograd, we have all of the tensors outputted by split() and we just want to stack them. dim = at::maybe_wrap_dim(dim, base.dim()); diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 9631872875c69..c4ffe05866473 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,6 +9,14 @@ namespace at::functionalization { +<<<<<<< HEAD +======= +ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { + if (out_idx == this->out_index) return *this; + return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. @@ -37,12 +45,20 @@ namespace at::functionalization { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); +<<<<<<< HEAD if (update.view_metas.empty()) { return t; } +======= + if (update.view_metas.empty()) return t; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { +<<<<<<< HEAD at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); +======= + at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -50,8 +66,14 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { +<<<<<<< HEAD // Each view inverse is implemented in ViewInverses.cpp. t = update.view_metas[i]->reverse(tmp_values[i], t); +======= + int64_t out_idx = update.view_metas[i].out_index; + // Each view inverse is implemented in ViewInverses.cpp. 
+ t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -96,7 +118,11 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) // SparseTensorImpl has no storage, so we cannot query its nbytes. // (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse) // Same for XLA +<<<<<<< HEAD if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) { +======= + if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes(); } else { original_storage_size_ = -1; @@ -105,13 +131,21 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } +<<<<<<< HEAD void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { +======= +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI +<<<<<<< HEAD TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, +======= + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 0c9c1fd775f32..9fff2c6e0b677 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,6 +8,7 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +<<<<<<< HEAD enum class InverseReturnMode { /// Specifies that functional inverses should always return a view. AlwaysView, @@ -77,20 +78,58 @@ enum class InverseReturnMode { // a type are used for supporting pickle serialization. struct ViewMeta { ViewMeta( +======= +// ViewMeta is a class used by the functionalization pass to navigate between +// a base tensor and a view tensor. 
+// For example, if I call `b = a.view1(...)` +// the functionalization pass will generate and store a ViewMeta on b that looks +// like: +// +// ViewMeta( +// [](const Tensor& base, int64_t mutated_view_idx) { +// return base.view1(...); +// }, +// [](const at::Tensor& base, const at::Tensor& mutated_view, +// int64_t mutated_view_idx) -> at::Tensor { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } +// +// The forward_fn lambda describes how to replay view1 on a tensor. +// +// The reverse_fn lambda describes how, given a tensor that is already a view, +// how to get the corresponding base tensor. See Note [Functionalization Pass: +// View Inverses] for details. +struct ViewMeta { + ViewMeta( + std::function forward, + std::function reverse, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) +<<<<<<< HEAD : out_index(out_idx), +======= + : forward_fn(std::move(forward)), + reverse_fn(std::move(reverse)), + out_index(out_idx), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} +<<<<<<< HEAD virtual ~ViewMeta() = default; virtual Tensor forward(const Tensor& base) = 0; virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; +======= + std::function forward_fn; + std::function reverse_fn; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [out_idx in ViewMeta] int64_t out_index; @@ -102,6 +141,7 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; +<<<<<<< HEAD // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. // @@ -113,6 +153,12 @@ struct ViewMeta { "ViewMeta::to_out_index not implemented. ", "Likely because there's only one output."); } +======= + // Returns a copy of the current ViewMeta, if out_idx matches the current + // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // functions, but a new out index. 
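The comment above documents the lambda-based ViewMeta that the right-hand side of this conflict still uses (the left-hand side replaces it with virtual forward/reverse methods). A hypothetical hand-written instance for a `select(0, idx)` view, shown only to make the forward_fn/reverse_fn contract concrete; real instances are produced by codegen:

```cpp
// Hypothetical ViewMeta for `b = a.select(0, idx)` under the lambda-based API
// described above; only compiles against that variant of the struct.
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_select_view_meta(int64_t idx) {
  return at::functionalization::ViewMeta(
      // forward_fn: how to replay the view on a (possibly updated) base.
      [idx](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.select(0, idx);
      },
      // reverse_fn: how to write a mutated view back into the base
      // (see Note [Functionalization Pass: View Inverses]).
      [idx](const at::Tensor& base, const at::Tensor& mutated_view,
            int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.select_scatter(mutated_view, 0, idx);
      },
      /*has_symbolic_inputs=*/false);
}
```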
+ ViewMeta to_out_idx(int64_t out_idx); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -145,14 +191,22 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) +<<<<<<< HEAD const std::vector> view_metas; +======= + const std::vector view_metas; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, +<<<<<<< HEAD const std::vector>& view_metas); +======= + const std::vector& view_metas); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool apply_updates(); const Tensor& base() { return base_; @@ -174,9 +228,12 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { ~FunctionalStorageImpl() override = default; +<<<<<<< HEAD uint64_t mutation_counter() { return mutation_counter_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void mark_mutation() { mutation_counter_++; } @@ -205,17 +262,23 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { void mark_inductor_storage_resize(c10::SymInt new_size) { inductor_storage_resized_ = true; curr_storage_size_ = std::move(new_size); +<<<<<<< HEAD inductor_storage_resized_counter_++; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool was_inductor_storage_resized() { return inductor_storage_resized_; } +<<<<<<< HEAD uint64_t inductor_storage_resized_counter() { return inductor_storage_resized_counter_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: // NB: base_ should always point to a tensor BELOW the current // functionalization layer. This is mainly to avoid reference cycles. e.g. @@ -261,7 +324,10 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // (1) There were any storage resizes on a graph input // (2) The original/curr storage size tell us if these resizes result in a nop bool inductor_storage_resized_ = false; +<<<<<<< HEAD uint64_t inductor_storage_resized_counter_ = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymInt original_storage_size_; c10::SymInt curr_storage_size_; }; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 8b7b3bc42a9cb..cfe3e5867b10d 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -122,13 +122,18 @@ void FunctionalTensorWrapper::freeze_storage() const { // | have their own storages, but backends like functorch | // \/ are allowed to re-alias underneath the pass \/ // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - . 
+<<<<<<< HEAD // | underlying_storage | | underlying_storage | +======= +// | underyling_storage | | underyling_storage | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - . // // This constructor is only used by view ops. // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. +<<<<<<< HEAD FunctionalTensorWrapper::FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, @@ -142,6 +147,19 @@ FunctionalTensorWrapper::FunctionalTensorWrapper( base->is_multi_output_view_ || meta->is_multi_output), was_storage_changed_(base->was_storage_changed_), is_symbolic_(base->is_symbolic_) { +======= +FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + view_value.device() + ), + value_(view_value), + is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) +{ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -150,10 +168,18 @@ FunctionalTensorWrapper::FunctionalTensorWrapper( view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); +<<<<<<< HEAD maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } +======= + maybe_mark_symbolic(meta); + storage_ = base->storage_; // alias this tensor's storage with the base tensor's +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -177,18 +203,31 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] +<<<<<<< HEAD void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. maybe_mark_symbolic(meta.get()); +======= +void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { + view_metas_.push_back(meta); + // Manually track the fact that this tensor recieved a metadata mutation! + has_metadata_mutation_ = true; + // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. + maybe_mark_symbolic(meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. 
// An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. at::AutoDispatchSkipFunctionalize guard; +<<<<<<< HEAD value_ = meta->forward(value_); +======= + value_ = meta.forward_fn(value_, meta.out_index); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -274,7 +313,11 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) { // (We could check if the updated value has a new storage than the original value, // but this won't also let us uniquely determine if the tensor **also** // experienced a data mutation). +<<<<<<< HEAD mark_storage_changed(); +======= + was_storage_changed_ = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto sizes_ = value_.sym_sizes(); auto strides_ = value_.sym_strides(); @@ -287,11 +330,19 @@ void FunctionalTensorWrapper::storage_resize_(const c10::SymInt& new_size) { // storage resizing is severely limited: we only support resizing either to zero, or from zero bytes. TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size); // The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want +<<<<<<< HEAD // resize_() calls to actually emit any ops in the functional graph. // How does it work? // Resizing up (old size == 0): // We do nothing in this case. // The expectation is that for the user code to be valid, the next op that should run against the current tensor "x" +======= + // resize_() calls to actualy emit any ops in the functional graph. + // How does it work? + // Resizing up (old size == 0): + // We do nothing in this case. + // The expection is that for the user code to be valid, the next op that should run against the current tensor "x" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // will be a x.copy_(y) (or similar), that will fully overwrite the data of x. // If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call // (otherwise the eager code would be invalid), @@ -328,7 +379,11 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) { // We're also no longer re-generate "b" fully from "a" anymore, since "a" refers to a slice of "b"'s data. // // This is probably fixable in theory, but: +<<<<<<< HEAD // - the fix would likely complicated the functionalization logic quite a bit. +======= + // - the fix would likey complicated the functionalization logic quite a bit. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // - the primary use case for resize_() today is resizing zero-sized tensors in out= variants of operators // - resize_() also can give you weird results today if you try to resize_() a weirdly strided tensor. // @@ -345,7 +400,11 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) { set_sizes_and_strides(value_.sizes(), value_.strides()); refresh_numel(); // (Technically we should be guaranteed that the tensor was already contiguous, +<<<<<<< HEAD // since it's guaranteed not to have been a view. 
Doesn't hurt to run though) +======= + // since it's guaranteed not to have been a view. Doesnt hurt to run though) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) refresh_contiguous(); // Swapping out the storage of a tensor (aka from a resize_() call) will update the sizes and strides of the tensor, // so we need to record the fact that metadata was mutated. @@ -369,8 +428,20 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } +<<<<<<< HEAD const std::vector>& FunctionalTensorWrapper::view_metas() const { return view_metas_; +======= +Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { + auto t = base; + + // Reapply views to get the viewed tensor from the base in alias_ + for (auto& view_meta: view_metas_) { + t = view_meta.forward_fn(t, view_meta.out_index); + } + + return t; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void FunctionalTensorWrapper::regenerate_from_base() { @@ -379,7 +450,11 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); +<<<<<<< HEAD t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); +======= + t = apply_view_metas(t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -479,10 +554,14 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(). return storage().data_ptr().device(); +======= + return value_.unsafeGetTensorImpl()->device(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); @@ -496,8 +575,13 @@ int64_t FunctionalTensorWrapper::dim_custom() const { int64_t FunctionalTensorWrapper::numel_custom() const { return value_.unsafeGetTensorImpl()->numel(); } +<<<<<<< HEAD c10::SymBool FunctionalTensorWrapper::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { return value_.unsafeGetTensorImpl()->sym_is_contiguous(memory_format); +======= +bool FunctionalTensorWrapper::is_contiguous_custom(at::MemoryFormat memory_format) const { + return value_.unsafeGetTensorImpl()->is_contiguous(memory_format); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes_custom() const { return value_.unsafeGetTensorImpl()->sym_sizes(); @@ -576,7 +660,11 @@ std::vector from_functional_tensor(ITensorListRef t_list) { for (const auto& tensor : t_list) { // from_functional_tensor(Tensor) has asserts to make sure you don't accidentally call // it on a non-functional input, +<<<<<<< HEAD // but from_functional_tensor(TensorList) can receive a list containing both +======= + // but from_functional_tensor(TensorList) can recieve a list containing both +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // functional and non-functional tensors. // Example of when that can happen: torch.cat(function_input_tensor, global_state_tensor). 
// When that happens, we're okay with only unwrapping the functional tensors. @@ -721,11 +809,19 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { +<<<<<<< HEAD if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; if (!e.has_value() || !e->defined()) { continue; } +======= + if (t_list.empty()) return false; + auto functional_count = 0; + for (const auto i : c10::irange(t_list.size())) { + auto const & e= t_list[i]; + if (!e.has_value() || !e->defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (isFunctionalTensor(e)) { ++functional_count; } @@ -735,10 +831,17 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { +<<<<<<< HEAD if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { if (!tensor.defined()) { continue; } +======= + if (list.size() == 0) return false; + auto functional_count = 0; + for (const auto& tensor : list) { + if (!tensor.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -756,6 +859,7 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } +<<<<<<< HEAD Tensor create_functional_tensor_with_view_meta( const at::Tensor& view_to_wrap, const at::Tensor& base, @@ -765,10 +869,17 @@ Tensor create_functional_tensor_with_view_meta( TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); auto meta_ = meta; +======= +Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); + TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); + auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
+<<<<<<< HEAD meta_ = meta->to_out_index(out_idx); } return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); @@ -778,6 +889,14 @@ std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const at::Tensor& base, const std::shared_ptr& meta) { +======= + meta = meta.to_out_idx(out_idx); + } + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); +} + +std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -787,12 +906,17 @@ std::vector create_functional_tensor_with_view_meta( return outputs; } +<<<<<<< HEAD void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { +======= +void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +<<<<<<< HEAD Tensor apply_view_meta_sequence( const Tensor& base, const std::vector>& sequence) { @@ -803,6 +927,8 @@ Tensor apply_view_meta_sequence( return r; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -834,7 +960,11 @@ void setFunctionalizationReapplyViewsTLS(bool reapply_views) { // This function will "functionalize" it. // That is, it will call the operator, but removing any intermediate views/mutations // that are performed inside of it. +<<<<<<< HEAD // This is useful for LTC/XLA, which would like to reuse some of our composite kernels +======= +// This is useful for LTC/XLA, which would like to re-use some of our composite kernels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // from pytorch core but not have to worry about the view ops that they might call. // e.g. 
at::block_diag void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* stack) { @@ -896,7 +1026,11 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); +<<<<<<< HEAD if (!t.defined()) { continue; } +======= + if (!t.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 6d9050728da70..3beade67d507a 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,11 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, +<<<<<<< HEAD const std::shared_ptr& meta); +======= + const functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -74,9 +78,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool has_metadata_mutation() const { return has_metadata_mutation_; } +<<<<<<< HEAD uint64_t mutation_counter() const { return functional_storage_impl()->mutation_counter(); } +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void mark_mutation() { functional_storage_impl()->mark_mutation(); } @@ -99,17 +107,28 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } +<<<<<<< HEAD void maybe_mark_symbolic(functionalization::ViewMeta* meta) { is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; +======= + void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { + is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool is_symbolic() const { return is_symbolic_; } +<<<<<<< HEAD // Retrieves the ViewMeta sequence of this tensor. const std::vector>& view_metas() const; +======= + // Runs the forward_fn of every ViewMeta collected in the current instance + // to some other base. + Tensor apply_view_metas(const Tensor& base); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,8 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. 
+<<<<<<< HEAD void mutate_view_meta( const std::shared_ptr& meta); +======= + void mutate_view_meta(const at::functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -164,6 +187,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return was_storage_changed_; } +<<<<<<< HEAD void mark_storage_changed() { was_storage_changed_ = true; storage_changed_counter_++; @@ -171,6 +195,10 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { uint64_t storage_changed_counter() { return storage_changed_counter_; +======= + void set_storage_changed() { + was_storage_changed_ = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // A FunctionalTensor is considered a base if its not a view of another @@ -189,9 +217,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return functional_storage_impl()->was_inductor_storage_resized(); } +<<<<<<< HEAD bool inductor_storage_resized_counter() { return functional_storage_impl()->inductor_storage_resized_counter(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The functionalization pass can be used to remove mutations. // It does so by replacing any mutation op with it's corresponding // out-of-place op, followed by a call to replace_(). e.g: @@ -237,8 +268,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { at::IntArrayRef strides_custom() const override; int64_t dim_custom() const override; int64_t numel_custom() const override; +<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymIntArrayRef sym_sizes_custom() const override; c10::SymInt sym_size_custom(int64_t d) const override; c10::SymIntArrayRef sym_strides_custom() const override; @@ -281,12 +316,19 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_multi_output_view_ = false; // Did the tensor experience a set_() call. bool was_storage_changed_ = false; +<<<<<<< HEAD uint64_t storage_changed_counter_ = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Did the tensor experience any view operation with symbolic int. 
bool is_symbolic_ = false; size_t generation_ = 0; +<<<<<<< HEAD std::vector> view_metas_; +======= + std::vector view_metas_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: static void copy_tensor_metadata( @@ -301,7 +343,11 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { namespace functionalization { namespace impl { +<<<<<<< HEAD inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( +======= +TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& tensor) { auto functional_impl = static_cast(tensor.unsafeGetTensorImpl()); @@ -378,11 +424,16 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, +<<<<<<< HEAD const std::shared_ptr& meta, +======= + functionalization::ViewMeta meta, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, +<<<<<<< HEAD const std::shared_ptr& meta); void mutate_view_meta( @@ -392,6 +443,13 @@ void mutate_view_meta( TORCH_API Tensor apply_view_meta_sequence( const Tensor& base, const std::vector>& sequence); +======= + const functionalization::ViewMeta& meta); + +void mutate_view_meta( + const Tensor& self, + const functionalization::ViewMeta& meta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 10f988b4d2815..51f1e43a68498 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,5 +1,8 @@ +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -9,6 +12,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -29,6 +36,7 @@ #include #endif +<<<<<<< HEAD namespace at::functionalization { Tensor resize__ViewMeta::forward(const Tensor& base) { @@ -54,6 +62,8 @@ Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_ } // namespace at::functionalization +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -132,9 +142,13 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); +<<<<<<< HEAD if (!t.defined()) { continue; } +======= + if (!t.defined()) continue; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto 
t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -197,8 +211,24 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); +<<<<<<< HEAD auto view_meta = std::make_shared( reapply_views, size.vec()); +======= + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } + }, + [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); + }, + /*has_symbolic_inputs=*/false + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -317,11 +347,25 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } +<<<<<<< HEAD bool has_symbolic_inputs = std::any_of( size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); auto view_meta = std::make_shared( has_symbolic_inputs, size.vec()); +======= + bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + + at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( + [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return at::_unsafe_view_symint(base, size); + }, + [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); + }, + /*has_symbolic_inputs=*/has_symbolic_inputs + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] @@ -331,9 +375,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); if (!stride.has_value()) { +<<<<<<< HEAD TORCH_SYM_CHECK( self.sym_is_contiguous(), +======= + // With unbacked symints, computeStride could fail even on contiguous + // tensors. In this case, we can use the strides of an empty tensor of + // inferred_size. 
+ TORCH_CHECK( + self.is_contiguous(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "View is not valid from size:", self.sym_sizes(), " stride: ", @@ -342,9 +394,12 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt inferred_size, " in case of unbacked symbols consider adding torch.check to guide computing strides."); +<<<<<<< HEAD // With unbacked symints, computeStride could fail even on contiguous // tensors. In this case, we can use the strides of an empty tensor of // inferred_size. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stride = at::detail::empty_symint_meta( inferred_size, std::nullopt, diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 817bf0ddba0b8..29c1596b0b2e3 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -4,7 +4,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -27,7 +30,13 @@ inline void infer_size_impl( std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { +<<<<<<< HEAD TORCH_CHECK(!infer_dim, "only one dimension can be inferred"); +======= + if (infer_dim) { + throw std::runtime_error("only one dimension can be inferred"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) infer_dim = dim; } else { // in case of unbacked shape[dim] we assume it's not -1 and add a runtime @@ -44,6 +53,7 @@ inline void infer_size_impl( } } +<<<<<<< HEAD if (infer_dim) { // numel is the product of known sizes, it has to be divisible by newsize. // and newsize should be positive unless newsize == numel (we throw @@ -77,6 +87,9 @@ inline void infer_size_impl( numel); } +======= + auto set_infer_dim = [&]() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. However, a nice error message is needed // because users often use `view` as a way to flatten & unflatten @@ -85,15 +98,29 @@ inline void infer_size_impl( // works yet // empty_tensor.view(-1, 0) // doesn't. 
+<<<<<<< HEAD TORCH_MAYBE_SYM_CHECK( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); +<<<<<<< HEAD res[*infer_dim] = numel / newsize; return; +======= + res[*infer_dim] = numel / newsize; + return; + }; + + if (infer_dim && newsize > 0 && numel % newsize == 0) { + set_infer_dim(); + return; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_MAYBE_SYM_CHECK( @@ -102,6 +129,12 @@ inline void infer_size_impl( shape, "' is invalid for input of size ", numel); +<<<<<<< HEAD +======= + if (infer_dim) { + set_infer_dim(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyBatchedFallback.cpp b/aten/src/ATen/LegacyBatchedFallback.cpp index f2b527302a97b..f49559e21f97f 100644 --- a/aten/src/ATen/LegacyBatchedFallback.cpp +++ b/aten/src/ATen/LegacyBatchedFallback.cpp @@ -218,7 +218,11 @@ static Tensor safeStack(TensorList tensors) { // is possible for the backward function to return an undefined grad for some // grad_input for each example. In that case, we return an undefined grad. // +<<<<<<< HEAD // It is theoretically possible for *some* of the examples to produce an +======= + // It is theoretically posssible for *some* of the examples to produce an +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // undefined grad (a kernel could peek at the gradient values and return an // undefined tensor if it determines the gradient is full of zeros). We // could handle this by treating the undefined grad as a zero-filled tensor diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.cpp b/aten/src/ATen/LegacyBatchedTensorImpl.cpp index cceefe985a7e2..d944682a2e8e2 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.cpp +++ b/aten/src/ATen/LegacyBatchedTensorImpl.cpp @@ -84,7 +84,11 @@ IntArrayRef BatchedTensorImpl::strides_custom() const { // TODO: implement proper contiguity on batched tensor, then put // sizes_strides_policy back to Default +<<<<<<< HEAD c10::SymBool BatchedTensorImpl::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { +======= +bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", "other than torch.contiguous_format"); diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 798e3535af3fb..22d2400b26a9e 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -82,8 +82,12 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { IntArrayRef strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error // messages. 
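The InferSize.h hunk above implements the usual view/reshape rule: at most one -1 entry, which is filled with numel divided by the product of the known sizes, with a dedicated error for the ambiguous 0-element case. A small assumed usage of the public helper declared at the bottom of that header:

```cpp
// Assumed usage of at::infer_size from the header above: {4, -1} with 24
// elements resolves to {4, 6}; a second -1, a non-divisible numel, or a
// 0-element tensor combined with -1 raises instead.
#include <ATen/InferSize.h>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> sizes = at::infer_size({4, -1}, /*numel=*/24);
  std::cout << sizes[0] << " x " << sizes[1] << std::endl;  // prints "4 x 6"
  return 0;
}
```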
+<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; void set_storage_offset(int64_t storage_offset) override; diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index 2c54718e938fb..4b48afc5389fe 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -58,7 +58,11 @@ namespace at { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return dim == 0 || dim == -1; } @@ -365,7 +369,11 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) { return self_physical.getPhysicalToLogicalMap().apply(result); } +<<<<<<< HEAD int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +======= +static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return maybe_wrap_dim(dim, static_cast(input_sizes.size())) + num_batch_dims; } @@ -488,7 +496,11 @@ Tensor view_as_complex_batching_rule(const Tensor& self) { // Checks that the smallest batch stride is greater than the largest example // stride. This is something we can support but we choose not to because it's // potentially error prone. +<<<<<<< HEAD void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +======= +static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto smallest_batch_stride = std::min_element( physical_strides.begin(), physical_strides.begin() + num_batch_dims); auto largest_example_stride = std::max_element( @@ -508,7 +520,11 @@ void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_bat // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). +<<<<<<< HEAD std::optional maximum_indexable_location( +======= +static std::optional maximum_indexable_location( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -521,7 +537,11 @@ std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
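Review note (illustrative sketch, not part of the patch): the as_strided validity check above compares the range of storage locations a candidate view can touch against the range its base can touch. A simplified standalone sketch of the quantity involved, with hypothetical names; the real helper additionally handles symbolic sizes and batch dimensions:

#include <cstdint>
#include <optional>
#include <vector>

// Largest flat storage index reachable by (sizes, strides, storage_offset):
//   storage_offset + sum_i (sizes[i] - 1) * strides[i]
// No location is reachable at all if any dimension has size 0.
std::optional<int64_t> max_indexable_location(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides,
    int64_t storage_offset) {
  int64_t max_index = storage_offset;
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) return std::nullopt;  // zero-size dim: nothing addressable
    max_index += (sizes[d] - 1) * strides[d];
  }
  return max_index;
}

// In this simplified model, x.as_strided(sizes, strides, offset) is accepted
// when its maximum indexable location does not exceed that of x itself.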
+<<<<<<< HEAD void checkBasicAsStridedValidForSlice( +======= +static void checkBasicAsStridedValidForSlice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& physical_tensor, int64_t num_batch_dims, IntArrayRef sizes, diff --git a/aten/src/ATen/LegacyVmapTransforms.h b/aten/src/ATen/LegacyVmapTransforms.h index be6cf1b697a22..3ca5c09332f98 100644 --- a/aten/src/ATen/LegacyVmapTransforms.h +++ b/aten/src/ATen/LegacyVmapTransforms.h @@ -140,7 +140,11 @@ struct TORCH_API VmapPhysicalView { // mapping a physical tensor to a new logical tensor (BatchedTensor) VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const; +<<<<<<< HEAD // Maps a logical shape to a physical shape by prepending the batch +======= + // Maps a logical shape to a physical shape by pre-pending the batch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // sizes to the logical shape. VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const; diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index ed697c32b58a8..ac8060bdbd03b 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -62,7 +62,11 @@ constexpr const char* unknown_eventname = "eventname not specified"; #endif } // namespace (anonymous) +<<<<<<< HEAD MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size) +======= +MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : filename_(filename.empty() ? 
unknown_filename : filename) , size_(0) // to be filled later #ifdef _WIN32 @@ -292,6 +296,7 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")"); } +<<<<<<< HEAD #ifdef HAVE_POSIX_FALLOCATE if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { @@ -314,6 +319,8 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { #ifndef STRIP_ERROR_MESSAGES int last_err = errno; @@ -321,7 +328,11 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, ::close(fd); TORCH_CHECK(false, "unable to stretch file <", filename_, "> to the right size: ", c10::utils::str_error(last_err), " (", last_err, ")"); } +<<<<<<< HEAD /* on macOS write returns with errno 45 (Operation not supported) when used +======= +/* on macOS write returns with errno 45 (Opperation not supported) when used +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * with a file descriptor obtained via shm_open */ #ifndef __APPLE__ @@ -494,7 +505,11 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags, initializeAlloc(); } +<<<<<<< HEAD RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size) +======= +RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : RefcountedMapAllocatorArgCheck(flags) , MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) { @@ -614,7 +629,11 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; } +<<<<<<< HEAD at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +======= +at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; @@ -626,7 +645,11 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags, return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; } +<<<<<<< HEAD at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +======= +at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, 
flags, size); if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment; return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; diff --git a/aten/src/ATen/MapAllocator.h b/aten/src/ATen/MapAllocator.h index 7a3415a4c4112..a9cf976b96cd3 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -25,7 +25,11 @@ class TORCH_API MapAllocator { public: MapAllocator(std::string_view filename, int flags, size_t size); MapAllocator( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string_view filename, int fd, int flags, @@ -59,14 +63,22 @@ class TORCH_API MapAllocator { return flags_; } +<<<<<<< HEAD static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); +======= + static MapAllocator* fromDataPtr(const at::DataPtr&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static at::DataPtr makeDataPtr( std::string_view filename, int flags, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, @@ -105,13 +117,21 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, public: RefcountedMapAllocator(const char* filename, int flags, size_t size); RefcountedMapAllocator( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, size_t size); +<<<<<<< HEAD static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); +======= + static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RefcountedMapAllocator(const RefcountedMapAllocator&) = delete; RefcountedMapAllocator(RefcountedMapAllocator&&) = delete; RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete; @@ -122,7 +142,11 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( +<<<<<<< HEAD WithFd /*unused*/, +======= + WithFd, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const char* filename, int fd, int flags, diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 1bc8c30158aec..004e35b82904c 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -24,7 +24,11 @@ MemOverlap has_internal_overlap(TensorImpl* t) { } } +<<<<<<< HEAD if (t->is_non_overlapping_and_dense_or_false()) { +======= + if (t->is_non_overlapping_and_dense()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlap::No; } @@ -63,7 +67,11 @@ MemOverlapStatus get_overlap_status(const TensorImpl* a, const TensorImpl* b) { if (a->numel() == 0 || b->numel() == 0) { return MemOverlapStatus::No; } +<<<<<<< HEAD if (!a->is_non_overlapping_and_dense_or_false() || 
!b->is_non_overlapping_and_dense_or_false()) { +======= + if (!a->is_non_overlapping_and_dense() || !b->is_non_overlapping_and_dense()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlapStatus::TooHard; } // Test for storage equality, rather than pointer equality. diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 2de73a70dd332..6448c88c81e63 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -179,7 +179,11 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef return; } const auto src_names = src.names(); +<<<<<<< HEAD const auto result_dim = result.dim(); +======= + const auto result_dim = static_cast(result.dim()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto src_dim = static_cast(src_names.size()); const auto excluded_dim = static_cast(excluded_idxs.size()); TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim); diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index c6198dccd2431..9f0388c7fccf0 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -167,14 +167,22 @@ TORCH_API TensorImpl* propagate_names( TORCH_API void propagate_names(TensorImpl* result, /*const */ TensorImpl* src); +<<<<<<< HEAD inline void propagate_names( +======= +TORCH_API inline void propagate_names( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& result, DimnameList names, bool validate_names = false) { propagate_names(result.unsafeGetTensorImpl(), names, validate_names); } +<<<<<<< HEAD inline void propagate_names_if_nonempty( +======= +TORCH_API inline void propagate_names_if_nonempty( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& result, DimnameList names, bool validate_names = false) { @@ -182,7 +190,13 @@ inline void propagate_names_if_nonempty( result.unsafeGetTensorImpl(), names, validate_names); } +<<<<<<< HEAD inline void propagate_names(const TensorBase& result, const TensorBase& src) { +======= +TORCH_API inline void propagate_names( + const TensorBase& result, + const TensorBase& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) propagate_names(result.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); } diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index ea951ed3db136..fcf4d5183d721 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -211,7 +211,11 @@ NestedTensorImpl::NestedTensorImpl( } // assume contiguous, `nested_strides` and `offsets` +<<<<<<< HEAD // can be inferred from `nested_sizes` +======= +// can be infered from `nested_sizes` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NestedTensorImpl::NestedTensorImpl( const at::Tensor& buffer, const at::Tensor& nested_sizes) @@ -273,7 +277,11 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const { return NestedTensorImpl::numel_custom(); } +<<<<<<< HEAD c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { 
+======= +bool NestedTensorImpl::is_contiguous_custom(MemoryFormat) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return nested_tensor_impl_is_contiguous(this); } IntArrayRef NestedTensorImpl::sizes_custom() const { diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 9b92e9ec83ad2..56e931773068f 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -32,7 +32,11 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { at::Tensor nested_strides, at::Tensor storage_offsets); // assume contiguous, `nested_strides` and `offsets` +<<<<<<< HEAD // can be inferred from `nested_sizes` +======= + // can be infered from `nested_sizes` +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) explicit NestedTensorImpl( const at::Tensor& buffer, const at::Tensor& nested_sizes); @@ -115,8 +119,12 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // with real implementations int64_t numel_custom() const override; c10::SymInt sym_numel_custom() const override; +<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom( MemoryFormat /*memory_format*/) const override; +======= + bool is_contiguous_custom(MemoryFormat) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t size_custom(int64_t d) const override { return this->size(d); } diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index d09a33841b948..b16338dd9c6ac 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -14,7 +14,11 @@ inline int64_t divup(int64_t x, int64_t y) { TORCH_API void init_num_threads(); // Sets the number of threads to be used in parallel region +<<<<<<< HEAD TORCH_API void set_num_threads(int /*nthreads*/); +======= +TORCH_API void set_num_threads(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns the maximum number of threads that may be used in a parallel region TORCH_API int get_num_threads(); @@ -37,7 +41,11 @@ inline void lazy_init_num_threads() { } } +<<<<<<< HEAD TORCH_API void set_thread_num(int /*id*/); +======= +TORCH_API void set_thread_num(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_API ThreadIdGuard { public: @@ -93,12 +101,20 @@ ident: identity for binary combination function sf. sf(ident, x) needs to return x. f: function for reduction over a chunk. f needs to be of signature scalar_t +<<<<<<< HEAD f(int64_t partial_begin, int64_t partial_end, scalar_t identify) +======= +f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sf: function to combine two partial results. sf needs to be of signature scalar_t sf(scalar_t x, scalar_t y) +<<<<<<< HEAD For example, you might have a tensor of 10000 entries and want to sum together +======= +For example, you might have a tensor of 10000 entires and want to sum together +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) all the elements. 
Parallel_reduce with a grain_size of 2500 will then allocate an intermediate result tensor with 4 elements. Then it will execute the function "f" you provide and pass the beginning and end index of these chunks, so @@ -130,7 +146,11 @@ inline scalar_t parallel_reduce( TORCH_API std::string get_parallel_info(); // Sets number of threads used for inter-op parallelism +<<<<<<< HEAD TORCH_API void set_num_interop_threads(int /*nthreads*/); +======= +TORCH_API void set_num_interop_threads(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns the number of threads used for inter-op parallelism TORCH_API size_t get_num_interop_threads(); diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp index e90065543e35b..bf1692a6166ca 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.cpp +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -42,6 +42,7 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { } bool torch_function_mode_enabled() { +<<<<<<< HEAD // Manually flatten because gcc is refusing to inline here. Note // that we are still calling __tls_get_addr twice here with GCC, // presumably because of @@ -50,6 +51,10 @@ bool torch_function_mode_enabled() { // performance. const auto& ptfs = pythonTorchFunctionState; return ptfs.disabled_state_ != TorchFunctionDisabledState::ALL_DISABLED && !ptfs.stack_.empty(); +======= + return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED && + PythonTorchFunctionTLS::stack_len() > 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // This is needed to disambiguate the ternary torch function disabled states diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h index 502bb535be050..e239ed73e487e 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.h +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -27,7 +27,10 @@ struct TORCH_API PythonTorchFunctionTLS { TorchFunctionDisabledState disabled_state_ = TorchFunctionDisabledState::ENABLED; std::vector> stack_; +<<<<<<< HEAD friend TORCH_API bool torch_function_mode_enabled(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_API bool torch_function_mode_enabled(); diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index 69d0c243156fa..08a8e01e62c6d 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -13,7 +13,11 @@ namespace { // and left at true for the rest of the execution. // It's an optimization so that users who never use default hooks don't need to // read the thread_local variables pack_hook_ and unpack_hook_. 
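Review note (illustrative sketch, not part of the patch): the comment above describes a cheap gate that spares callers who never install hooks from touching thread_local state at all. A standalone sketch of that pattern with hypothetical names (the sketch uses an atomic flag for clarity; it is not the ATen implementation):

#include <atomic>
#include <functional>
#include <utility>

namespace sketch {
std::atomic<bool> hooks_ever_used{false};
thread_local std::function<int(int)> pack_hook;  // hypothetical per-thread hook slot

void set_pack_hook(std::function<int(int)> h) {
  hooks_ever_used.store(true, std::memory_order_relaxed);
  pack_hook = std::move(h);
}

int maybe_pack(int value) {
  // Fast path: skip the thread_local read entirely if no hook was ever set.
  if (!hooks_ever_used.load(std::memory_order_relaxed)) return value;
  return pack_hook ? pack_hook(value) : value;
}
}  // namespace sketch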
+<<<<<<< HEAD bool is_initialized(false); +======= + static bool is_initialized(false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static void assertSavedTensorHooksNotDisabled() { diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index dec6d2e95960b..e71b0b9e9f9d8 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -252,7 +252,14 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); } +<<<<<<< HEAD c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); } +======= +bool SparseCsrTensorImpl::is_contiguous_custom(MemoryFormat) const { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index e764f954db33e..e94b6971c547d 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -32,10 +32,17 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { public: explicit SparseCsrTensorImpl( +<<<<<<< HEAD at::DispatchKeySet /*key_set*/, at::Device device, Layout layout, const caffe2::TypeMeta /*data_type*/); +======= + at::DispatchKeySet, + at::Device device, + Layout layout, + const caffe2::TypeMeta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void resize_(int64_t nnz, IntArrayRef size); void resize_and_clear_( @@ -86,8 +93,12 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { protected: IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; +<<<<<<< HEAD SymBool sym_is_contiguous_custom( MemoryFormat /*memory_format*/) const override; +======= + bool is_contiguous_custom(MemoryFormat) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index a2c12fcfe8b9b..b4ac24f951e0c 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -46,9 +46,13 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
+<<<<<<< HEAD explicit SparseTensorImpl( at::DispatchKeySet /*key_set*/, const caffe2::TypeMeta /*data_type*/); +======= + explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void release_resources() override; @@ -135,12 +139,21 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), +<<<<<<< HEAD "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", size.size(), ", sparse_dim = ", sparse_dim, ", dense_dim = ", dense_dim); +======= + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (nnz() > 0) { [[maybe_unused]] auto constexpr alt_options_msg = "You could try the following options:\n\ @@ -231,14 +244,22 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { +<<<<<<< HEAD _resize_(sparse_dim, dense_dim, size); +======= + return _resize_(sparse_dim, dense_dim, size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void resize_( int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { +<<<<<<< HEAD _resize_(sparse_dim, dense_dim, size); +======= + return _resize_(sparse_dim, dense_dim, size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // NOTE: this function will resize the sparse tensor and also set `indices` @@ -256,12 +277,21 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { "resize_and_clear_ called on tensor with symbolic shape") TORCH_CHECK( sparse_dim + dense_dim == static_cast(size.size()), +<<<<<<< HEAD "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", size.size(), ", sparse_dim = ", sparse_dim, ", dense_dim = ", dense_dim); +======= + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_sizes_and_strides(size, std::vector(size.size())); sparse_dim_ = sparse_dim; @@ -386,8 +416,13 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { private: explicit SparseTensorImpl( +<<<<<<< HEAD at::DispatchKeySet /*key_set*/, const caffe2::TypeMeta /*data_type*/, +======= + at::DispatchKeySet, + const caffe2::TypeMeta, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor indices, at::Tensor values); diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 1fa852686656f..603216cca9cc7 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -59,7 +59,11 @@ static inline void set_item(const Tensor& self, ArrayRef indices, c } } +<<<<<<< HEAD set_item(self, indices, value); +======= + return set_item(self, indices, value); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace indexing diff --git 
a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 9291d2e66e5f5..f5055d32e40fa 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -112,10 +112,17 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice); // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})` struct TORCH_API TensorIndex final { // Case 1: `at::indexing::None` +<<<<<<< HEAD TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {} // Case 2: "..." / `at::indexing::Ellipsis` TensorIndex(at::indexing::EllipsisIndexType /*unused*/) +======= + TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {} + + // Case 2: "..." / `at::indexing::Ellipsis` + TensorIndex(at::indexing::EllipsisIndexType) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : type_(TensorIndexType::Ellipsis) {} TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) { TORCH_CHECK_VALUE( @@ -214,7 +221,11 @@ inline Tensor applySlice( "step must be greater than zero"); // See NOTE [nested tensor size for indexing] +<<<<<<< HEAD if (self_sizes.has_value() && !self_sizes.value().empty()) { +======= + if (self_sizes.has_value()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Skip this optimization if we are tracing, as the trace may be polymorphic // over the shape of the `self` tensor, and we still want to record // the slice. @@ -223,7 +234,11 @@ inline Tensor applySlice( : self.sym_size(dim); if (!disable_slice_optimization && TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) && +<<<<<<< HEAD TORCH_STATICALLY_KNOWN_TRUE(length.sym_le(stop)) && step == 1) { +======= + TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } } @@ -252,7 +267,11 @@ inline Tensor applySelect( // Note: `size >= -index` is not equivalent to `size > -1 - index` if index // is INT64_MIN For std::numeric_limits::min() result of unary // minus is undefined by the standard but in practice is equal to self. 
On +<<<<<<< HEAD // the other hand, indexing wrapping is valid for all negative int64_t +======= + // the other hand, indexing wraping is valid for all negative int64_t +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // values, as x[INT64_MIN] is the same as x[INT64_MAX] TORCH_CHECK_INDEX( size.sym_gt(-1 - index) @@ -315,6 +334,7 @@ inline void recordTensorIndex( const Tensor& tensor, std::vector& outIndices, int64_t* dim_ptr) { +<<<<<<< HEAD if (outIndices.empty()) { outIndices.resize(*dim_ptr + 1); outIndices[*dim_ptr] = tensor; @@ -326,6 +346,12 @@ inline void recordTensorIndex( } else { *dim_ptr += 1; } +======= + // TODO: check scalarType + outIndices.resize(*dim_ptr + 1); + outIndices[*dim_ptr] = tensor; + (*dim_ptr)++; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline c10::List<::std::optional> typeConvertIndices( @@ -465,6 +491,7 @@ inline Tensor handleDimInMultiDimIndexing( original_tensor_device, prev_dim_result_sizes); (*dim_ptr)++; +<<<<<<< HEAD if (!outIndices.empty()) { outIndices.resize(outIndices.size() + 1); } @@ -475,13 +502,21 @@ inline Tensor handleDimInMultiDimIndexing( if (!outIndices.empty()) { outIndices.resize(outIndices.size() + ellipsis_ndims); } +======= + return result; + } else if (index.is_ellipsis()) { + (*dim_ptr) += original_tensor.dim() - (*specified_dims_ptr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return prev_dim_result; } else if (index.is_none()) { Tensor result = prev_dim_result.unsqueeze(*dim_ptr); (*dim_ptr)++; +<<<<<<< HEAD if (!outIndices.empty()) { outIndices.resize(outIndices.size() + 1); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } else if (index.is_boolean()) { Tensor result = prev_dim_result.unsqueeze(*dim_ptr); @@ -577,10 +612,13 @@ inline Tensor applySlicing( inline Tensor dispatch_index( const Tensor& self, std::vector&& indices) { +<<<<<<< HEAD // Remove trailing null elements from indices while (!indices.empty() && !indices.back().defined()) { indices.pop_back(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.index(impl::typeConvertIndices(self, std::move(indices))); } @@ -588,10 +626,13 @@ inline Tensor dispatch_index_put_( Tensor& self, std::vector&& indices, const Tensor& value) { +<<<<<<< HEAD // Remove trailing null elements from indices while (!indices.empty() && !indices.back().defined()) { indices.pop_back(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.index_put_( impl::typeConvertIndices(self, std::move(indices)), value); } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index b10d5c7d1fc3f..f9c28ed832b88 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -56,7 +56,11 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } } +<<<<<<< HEAD OptionalTensorRef make_otr(const TensorBase &tensor) { +======= +static OptionalTensorRef make_otr(const TensorBase &tensor) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) if (tensor.defined()) { return OptionalTensorRef(tensor); } else { @@ -208,7 +212,11 @@ bool TensorIteratorConfig::is_tensor_const(size_t idx) { // same strides are increasing. If dimensions are non-increasing, we move on to the next input to break the tie. // // Instead of applying rule 4 for tie breaking, we could move on to the next tensor directly. This would result in possibly +<<<<<<< HEAD // losing the correct permutation of the first tensor if there are permuted trivial dimensions, but could potentially +======= +// losing the correct permuation of the first tensor if there are permuted trivial dimensions, but could potentially +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // improve traversal order of the second tensor. We chose the former option to better propagate channels last layout // for example for a tensor with the sizes N1H1 // These rules result in the intuitive behavior that in most cases recovers permutation of either the first argument (if all @@ -244,7 +252,11 @@ void TensorIteratorBase::reorder_dimensions() { // initialize perm with n-1, n-2, ..., 1, 0 std::iota(perm_.rbegin(), perm_.rend(), 0); +<<<<<<< HEAD // Reordering dimensions changes iteration order +======= + // Reordering dimensions changes iteraton order +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (enforce_linear_iteration_) { permute_dimensions(perm_); return; @@ -765,8 +777,12 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { if (numel == 0) { return; } else if (numel < grain_size || at::get_num_threads() == 1) { +<<<<<<< HEAD serial_for_each(loop, {0, numel}); return; +======= + return serial_for_each(loop, {0, numel}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { serial_for_each(loop, {begin, end}); @@ -1534,7 +1550,11 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer. // Nothing beyond this point is important for meta functions, so it's fine to exit early here. +<<<<<<< HEAD // Extend the condition to MAIA tensors as MAIA tensors also don't have storage. +======= + // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (privateuse1_without_storage || common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index d8593a80292b3..dac89cece9c80 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -250,7 +250,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { using PtrVector = SmallVector; using StrideVector = SmallVector; +<<<<<<< HEAD void build(TensorIteratorConfig& /*config*/); +======= + void build(TensorIteratorConfig&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. 
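Review note (illustrative sketch, not part of the patch): the TensorIterator comment above says the inner loop works on 1-d strided operands, i.e. raw byte pointers advanced by per-operand byte strides, which is what lets one loop serve contiguous, strided, and broadcast inputs alike. A self-contained sketch of such an inner loop (hypothetical names, float-only for brevity):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// out[i] = a[i] + b[i]; each operand carries its own byte stride, and a stride
// of 0 broadcasts a single scalar across the whole run.
void add_1d_strided(char* out, const char* a, const char* b,
                    int64_t out_stride, int64_t a_stride, int64_t b_stride,
                    int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    float va, vb, vo;
    std::memcpy(&va, a + i * a_stride, sizeof(float));
    std::memcpy(&vb, b + i * b_stride, sizeof(float));
    vo = va + vb;
    std::memcpy(out + i * out_stride, &vo, sizeof(float));
  }
}

int main() {
  std::vector<float> a{1, 2, 3, 4}, out(4);
  float scalar = 10.f;
  // Broadcast `scalar` over `a` by giving it a byte stride of 0.
  add_1d_strided(reinterpret_cast<char*>(out.data()),
                 reinterpret_cast<const char*>(a.data()),
                 reinterpret_cast<const char*>(&scalar),
                 sizeof(float), sizeof(float), 0, 4);
  for (float v : out) std::cout << v << ' ';  // prints: 11 12 13 14
}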
@@ -388,7 +392,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Return scalar value from original_tensor_base if it is defined. When /// common_dtype is Half, casting scalar input to common_dtype might overflow. +<<<<<<< HEAD /// If the scalar is already given in the type of Half, then return scalar +======= + /// If the scalar is aleady given in the type of Half, then return scalar +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /// value from tensor_base. template T original_scalar_value(int64_t arg) { @@ -502,7 +510,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// kernels bool can_use_32bit_indexing() const; +<<<<<<< HEAD /// An "iterable" object that recursively splits this iterator into +======= + /// An "iteratable" object that recursively splits this iterator into +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /// sub-iterators that can use 32-bit indexing. SplitUntil32Bit with_32bit_indexing() const; @@ -618,6 +630,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { #undef TORCH_DISALLOW_TEMPORARIES protected: // Mutable reference as it moves tensors out of TensorIteratorConfig +<<<<<<< HEAD void populate_operands(TensorIteratorConfig& /*config*/); void mark_outputs(); void mark_resize_outputs(const TensorIteratorConfig& /*config*/); @@ -632,6 +645,22 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool fast_set_up(const TensorIteratorConfig& /*config*/); FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/); void compute_names(const TensorIteratorConfig& /*config*/); +======= + void populate_operands(TensorIteratorConfig&); + void mark_outputs(); + void mark_resize_outputs(const TensorIteratorConfig&); + void compute_mem_overlaps(const TensorIteratorConfig&); + void compute_shape(const TensorIteratorConfig&); + void compute_strides(const TensorIteratorConfig&); + void reorder_dimensions(); + void permute_dimensions(IntArrayRef perm); + void compute_types(const TensorIteratorConfig&); + ScalarType compute_common_dtype(); + void allocate_or_resize_outputs(); + bool fast_set_up(const TensorIteratorConfig&); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); + void compute_names(const TensorIteratorConfig&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void propagate_names_to_outputs(); void coalesce_dimensions(); @@ -878,7 +907,11 @@ class TORCH_API TensorIteratorConfig final { // Sets the enforce_linear_iteration_ flag, which is false by default. // If true, iteration goes in the same order as a C-contiguous tensor +<<<<<<< HEAD // is laid out in memory. i.e. last dimension iterates fastest. +======= + // is layed out in memory. i.e. last dimension iterates fastest. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // This iteration order can be less efficient and may even prevent // vectorization. So only use if the correctness of your kernel depends on it. 
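Review note (illustrative sketch, not part of the patch): "linear iteration" in the comment above means visiting elements in the order a C-contiguous tensor is laid out in memory, with the last dimension moving fastest. A tiny standalone sketch that decodes a flat index into coordinates in exactly that order (hypothetical helper):

#include <cstdint>
#include <iostream>
#include <vector>

void for_each_row_major(const std::vector<int64_t>& shape) {
  int64_t numel = 1;
  for (int64_t s : shape) numel *= s;
  for (int64_t flat = 0; flat < numel; ++flat) {
    // Decode the flat index into coordinates, last dimension fastest.
    std::vector<int64_t> idx(shape.size());
    int64_t rem = flat;
    for (int64_t d = static_cast<int64_t>(shape.size()) - 1; d >= 0; --d) {
      idx[d] = rem % shape[d];
      rem /= shape[d];
    }
    for (int64_t v : idx) std::cout << v << ' ';
    std::cout << '\n';
  }
}

int main() { for_each_row_major({2, 3}); }  // (0,0) (0,1) (0,2) (1,0) (1,1) (1,2)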
diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 515642a0c51d2..73c3b3c2d87b2 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -78,7 +78,11 @@ inline bool areAnyOptionalTensorSubclassLike( // NOTE: This function expects a scalar tensor of boolean dtype. // Eg. // Non-Composite Compliant Pattern : (t == 0).all().item() +<<<<<<< HEAD // Composite Compliant Pattern : is_salar_tensor_true((t == 0).all()) +======= +// Composite Compliant Patter : is_salar_tensor_true((t == 0).all()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool is_scalar_tensor_true(const Tensor& t) { TORCH_INTERNAL_ASSERT(t.dim() == 0) TORCH_INTERNAL_ASSERT(t.scalar_type() == kBool) diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 8236751679f06..68466ef915e41 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -273,11 +273,19 @@ void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layout layout) } void * maybe_data_ptr(const Tensor& tensor) { +<<<<<<< HEAD return tensor.defined() ? tensor.data_ptr() : nullptr; } void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? tensor->data_ptr() : nullptr; +======= + return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; +} + +void * maybe_data_ptr(const TensorArg& tensor) { + return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void check_dim_size( @@ -378,9 +386,15 @@ inline static std::optional computeStride_impl( (TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) && TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { // We want to accumulate stuff in view_numel until view_numel == tensor_numel, if we do not +<<<<<<< HEAD // know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1, // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. // That's why we use TORCH_GUARD_OR_TRUE below. +======= + // know if that is satisfied we keep accumalating. For example if view_numel = 1 and tensor_numel = u1, + // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. + // Thats why we use TORCH_GUARD_OR_TRUE below. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because // if we know view_numel < tensor_numel is false, we want to stop. 
Unless we know for sure newshape[view_d]==1 diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 22509c7be4e19..bfdd8f9bf51c5 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { @@ -20,7 +23,10 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), +<<<<<<< HEAD dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index d0d8112fc4cec..7728f99799d60 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -75,8 +75,11 @@ class TORCH_API ThreadLocalState { bool functionalization_reapply_views_state_; +<<<<<<< HEAD bool dtensor_allow_implicit_replication_; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TLS for arbitrary python objects that is registered via hooks at::impl::ThreadLocalPythonObjects saved_objects_; diff --git a/aten/src/ATen/TracerMode.h b/aten/src/ATen/TracerMode.h index d0d4c93a84f53..bffd0abda1533 100644 --- a/aten/src/ATen/TracerMode.h +++ b/aten/src/ATen/TracerMode.h @@ -27,7 +27,11 @@ // ops (ops being called by other ops). After the intermediate op call // finishes it's set back to the original `TracingState` object. // +<<<<<<< HEAD // The `TracingState` object in TLS can also be read/written via its Python +======= +// The `TracingState` obect in TLS can also be read/written via its Python +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // binding in `python_tracer.cpp`, and `get/setTracingState()` C++ APIs, // which are also exposed as `TORCH_API`. // diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index e9c936b906c67..a37d1bb99842f 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -20,7 +20,11 @@ namespace at { +<<<<<<< HEAD TORCH_API int _crash_if_asan(int /*arg*/); +======= +TORCH_API int _crash_if_asan(int); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Converts a TensorList (i.e. ArrayRef to vector of TensorImpl*) // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat. 
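Review note (illustrative sketch, not part of the patch): many hunks in this patch, including the _crash_if_asan declaration just above, differ only in whether an otherwise-unnamed parameter carries its name in a comment. A small self-contained example of that convention (hypothetical function):

#include <iostream>

// Declaration: the commented-out name documents what the int means while the
// parameter stays formally unnamed, so the definition below produces no
// -Wunused-parameter warning and commonly used style checks (e.g. clang-tidy's
// readability-named-parameter) accept the commented form.
int crash_if_asan_sketch(int /*arg*/);

// Definition: the parameter is genuinely unused here, so it stays unnamed too.
int crash_if_asan_sketch(int /*arg*/) {
  return 0;  // placeholder body for the sketch
}

int main() { std::cout << crash_if_asan_sketch(3) << '\n'; }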
diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 7239f357fdd64..5bc698b6cb3a7 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -95,6 +95,7 @@ std::string get_cpu_capability() { // environment variable auto capability = native::get_cpu_capability(); switch (capability) { +<<<<<<< HEAD case native::CPUCapability::DEFAULT: return "DEFAULT"; #if defined(HAVE_VSX_CPU_DEFINITION) @@ -107,6 +108,26 @@ std::string get_cpu_capability() { case native::CPUCapability::SVE256: return "SVE256"; #else +======= +#if defined(HAVE_VSX_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::VSX: + return "VSX"; +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::ZVECTOR: + return "Z VECTOR"; +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) + case native::CPUCapability::DEFAULT: + return "DEFAULT"; + case native::CPUCapability::SVE256: + return "SVE256"; +#else + case native::CPUCapability::DEFAULT: + return "NO AVX"; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case native::CPUCapability::AVX2: return "AVX2"; case native::CPUCapability::AVX512: diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index aa000b118daa2..a4c5d1e5b1224 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -121,7 +121,11 @@ inline int64_t legacy_cat_wrap_dim_symint( const std::vector>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } } @@ -135,7 +139,11 @@ inline int64_t legacy_cat_wrap_dim( const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { if (tensor.dim() == 1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } } diff --git a/aten/src/ATen/ZeroTensorFallback.cpp b/aten/src/ATen/ZeroTensorFallback.cpp index 40b34030b85b9..d29d2c981ad26 100644 --- a/aten/src/ATen/ZeroTensorFallback.cpp +++ b/aten/src/ATen/ZeroTensorFallback.cpp @@ -9,6 +9,7 @@ namespace at { +<<<<<<< HEAD /* * Design: * 1. ZeroTensors are regular tensors with TensorOptions, a storage @@ -39,6 +40,9 @@ namespace at { * it does not perfectly handle NaNs and Infs as we don't check the actual values * and assume that they are non-zero, non-inf, non-NaN etc. */ +======= + // TODO: add a note explaining the design decisions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ZeroTensors are designed to be immutable. 
Thus, we error out when an in-place operation is performed on ZeroTensors static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { const auto& arguments = op.schema().arguments(); @@ -124,7 +128,11 @@ namespace at { m.impl("clone", torch::CppFunction::makeFallthrough()); m.impl("dot", torch::CppFunction::makeFallthrough()); m.impl("vdot", torch::CppFunction::makeFallthrough()); +<<<<<<< HEAD // The functions in the list below have a specific registration in native_functions.yaml and +======= + // The functions in the list below have a specific registeration in native_functions.yaml and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // do not use the fallback. // m.impl("mul.Tensor", torch::CppFunction::makeFallthrough()); // m.impl("add.Tensor", torch::CppFunction::makeFallthrough()); diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index e3424cc4cb8eb..a2d3affa24bbc 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -36,7 +36,11 @@ namespace { using weakref_type = c10::weak_intrusive_ptr; using val_type = std::tuple; +<<<<<<< HEAD ska::flat_hash_map& get_cached_casts() { +======= +static ska::flat_hash_map& get_cached_casts() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static ska::flat_hash_map cached_casts; return cached_casts; } @@ -148,7 +152,11 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ +<<<<<<< HEAD static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional& /*unused*/, int64_t /*unused*/) { +======= +static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" @@ -216,7 +224,10 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(_convolution, lower_precision_fp) KERNEL_MPS(conv1d, lower_precision_fp) KERNEL_MPS(conv2d, lower_precision_fp) +<<<<<<< HEAD KERNEL_MPS(conv3d, lower_precision_fp) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) KERNEL_MPS(conv_tbc, lower_precision_fp) KERNEL_MPS(conv_transpose1d, lower_precision_fp) KERNEL_MPS(conv_transpose2d, input, lower_precision_fp) @@ -240,7 +251,10 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp) // fp32 +<<<<<<< HEAD KERNEL_MPS(conv_transpose3d, input, fp32) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) KERNEL_MPS(acos, fp32) KERNEL_MPS(asin, fp32) KERNEL_MPS(cosh, fp32) diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index 655b2343d5d5c..96d719004904c 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -25,7 +25,11 @@ TORCH_API void 
set_autocast_cache_enabled(bool enabled); // deprecated CUDA-specific autocast APIs C10_DEPRECATED_MESSAGE( "at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") +<<<<<<< HEAD inline bool is_enabled() { +======= +TORCH_API inline bool is_enabled() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -34,7 +38,11 @@ inline bool is_enabled() { } C10_DEPRECATED_MESSAGE( "at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") +<<<<<<< HEAD inline void set_enabled(bool enabled) { +======= +TORCH_API inline void set_enabled(bool enabled) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -43,7 +51,11 @@ inline void set_enabled(bool enabled) { } C10_DEPRECATED_MESSAGE( "at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") +<<<<<<< HEAD inline at::ScalarType get_autocast_gpu_dtype() { +======= +TORCH_API inline at::ScalarType get_autocast_gpu_dtype() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -52,7 +64,11 @@ inline at::ScalarType get_autocast_gpu_dtype() { } C10_DEPRECATED_MESSAGE( "at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") +<<<<<<< HEAD inline void set_autocast_gpu_dtype(at::ScalarType dtype) { +======= +TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( "at::autocast::", __func__, @@ -65,7 +81,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::is_" #name \ "_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ ") instead.") \ +<<<<<<< HEAD inline bool is_##name##_enabled() { \ +======= + TORCH_API inline bool is_##name##_enabled() { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -78,7 +98,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::set_" #name \ "_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ ", enabled) instead.") \ +<<<<<<< HEAD inline void set_##name##_enabled(bool enabled) { \ +======= + TORCH_API inline void set_##name##_enabled(bool enabled) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -91,7 +115,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::get_autocast_" #name \ "_dtype() is deprecated. 
Please use at::autocast::get_autocast_dtype(" #device_type \ ") instead.") \ +<<<<<<< HEAD inline at::ScalarType get_autocast_##name##_dtype() { \ +======= + TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -104,7 +132,11 @@ inline void set_autocast_gpu_dtype(at::ScalarType dtype) { "at::autocast::set_autocast_" #name \ "_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \ ", dtype) instead.") \ +<<<<<<< HEAD inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ +======= + TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_DEPRECATION( \ "at::autocast::", \ __func__, \ @@ -377,7 +409,11 @@ Keep it simple for now by assuming only one such flag is present in the argument list. If I ever need a function with more than flag I'll figure out something else. The policy is: +<<<<<<< HEAD If the user has explicitly specified a dtype, respect it. +======= +If the user has explicity specified a dtype, respect it. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Otherwise, set it to the autocast type. ********************************************************/ diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp index f3ddaedc5ecd6..a6428259d7f18 100644 --- a/aten/src/ATen/core/CachingHostAllocator.cpp +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -6,9 +6,15 @@ namespace at { namespace { +<<<<<<< HEAD std::array allocator_array{}; std::array +======= +static std::array + allocator_array{}; +static std::array +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allocator_priority{}; } // anonymous namespace diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 603e7e73bc1ea..66c8989fb133b 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,12 +1,18 @@ #pragma once #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -39,7 +45,11 @@ struct HostBlock { }; template +<<<<<<< HEAD struct alignas(hardware_destructive_interference_size) FreeBlockList { +======= +struct alignas(64) FreeBlockList { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::mutex mutex_; std::deque list_; }; @@ -50,6 +60,7 @@ namespace { constexpr size_t MAX_SIZE_INDEX = 64; } +<<<<<<< HEAD // A large reserved pinned memory segment that is created in advance which is used // to allocate small pinned memory requests to avoid calling into expensive APIs. 
// We never free this memory and move up the pointer as we allocate new blocks @@ -101,6 +112,21 @@ struct TORCH_API HostStats { // SUM: bytes allocated/reserved by this memory allocator. This accounts // for both free and in-use blocks. Stat allocated_bytes; +======= +// Struct containing memory allocator summary statistics for host. +struct TORCH_API HostStats { + // COUNT: allocations requested by client code. Note that active + // count can be extracted by looking at current allocations + Stat allocation; + // COUNT: number of allocated segments from host memory allocation. + Stat segment; + + // SUM: bytes allocated by this memory alocator. Note that active bytes + // can be extracted by looking at current bytes allocated + Stat allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + Stat reserved_bytes; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds DurationStat host_alloc_time; @@ -114,14 +140,18 @@ struct TORCH_API HostStats { // COUNT: number of times cudaHostFree/cudaHostUnregister was called. int64_t num_host_free = 0; // This is derived from segment or timing +<<<<<<< HEAD // Count of cudaHostAlloc/cudaHostRegister per bucket std::vector bucket_allocation = std::vector(MAX_SIZE_INDEX); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; // Struct containing memory allocator summary statistics for host, as they // are staged for reporting. This is a temporary struct that is used to // avoid locking the allocator while collecting stats. +<<<<<<< HEAD struct alignas(hardware_destructive_interference_size) HostStatsStaged { std::mutex timing_mutex_; // COUNT: total allocations (active + free) @@ -140,6 +170,21 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged { // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: bytes of allocation per bucket (active + free) +======= +struct alignas(64) HostStatsStaged { + std::mutex timing_mutex_; + // COUNT: allocations requested by client code resulting in a new segment/block allocation + // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocation; + // SUM: bytes within active memory blocks, including blocks that are + // currently in the free list. + // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocated_bytes; + // COUNT: number of allocations per bucket + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); + // SUM: bytes of allocation per bucket +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocated_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: time spent in cudaHostAlloc/cudaHostRegister @@ -258,6 +303,15 @@ struct CachingHostAllocatorImpl { // Check in the recently freed blocks with pending events to see if we // can reuse them. 
Call get_free_block again after processing events if (pinned_use_background_threads()) { +<<<<<<< HEAD +======= + process_events_for_specific_size(roundSize); + block = get_free_block(roundSize); + if (block) { + return {block->ptr_, reinterpret_cast(block)}; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { @@ -293,7 +347,10 @@ struct CachingHostAllocatorImpl { auto* block = reinterpret_cast(ctx); std::optional> events; +<<<<<<< HEAD ska::flat_hash_set streams; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -302,23 +359,38 @@ struct CachingHostAllocatorImpl { } else { events = std::vector(); events->reserve(block->streams_.size()); +<<<<<<< HEAD block->event_count_ += block->streams_.size(); // Move out streams to avoid holding the mutex during event recording streams = std::move(block->streams_); +======= + for (auto stream : block->streams_) { + record_stream(events, stream); + } + block->event_count_ += events->size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) block->streams_.clear(); } } +<<<<<<< HEAD // Event recording must be done outside the mutex to avoid potential // deadlocks (e.g., when Python GIL is involved) for (auto stream : streams) { record_stream(events, stream); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!events) { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); +<<<<<<< HEAD +======= + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { // restore these events that record by used streams. 
std::lock_guard g(events_mutex_); @@ -378,12 +450,18 @@ struct CachingHostAllocatorImpl { for (auto* block : blocks_to_remove) { blocks_.erase(block); ptr_to_block_.erase(block->ptr_); +<<<<<<< HEAD auto index = size_index(block->size_); free_block(block); stats_.allocations.decrease(1); stats_.allocated_bytes.decrease(block->size_); stats_.allocation_bucket_stats[index].decrease(1); stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); +======= + stats_.allocation.decrease(1); + stats_.allocated_bytes.decrease(block->size_); + free_block(block); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) delete block; } } @@ -394,8 +472,12 @@ struct CachingHostAllocatorImpl { } virtual bool pinned_use_background_threads() { +<<<<<<< HEAD return c10::CachingAllocator::AcceleratorAllocatorConfig:: pinned_use_background_threads(); +======= + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const { @@ -430,17 +512,29 @@ struct CachingHostAllocatorImpl { // per bucket (we pick index 0 arbitrarily). These are also all the host // allocations, not taking into account caching and free lists. if (i == 0) { +<<<<<<< HEAD stats.allocations = stats_.allocations; stats.allocated_bytes = stats_.allocated_bytes; stats.num_host_alloc = stats.allocations.allocated; stats.num_host_free = stats.allocations.freed; +======= + stats.segment = stats_.allocation; + stats.reserved_bytes = stats_.allocated_bytes; + stats.num_host_alloc = stats.segment.allocated; + stats.num_host_free = stats.segment.freed; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Bucket stats need to be merged with the slow-path stats. We do this in // a best effort manner, since we can't really replay the cached events per bucket. +<<<<<<< HEAD add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]); add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]); stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated; +======= + add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]); + add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Get the timing stats @@ -455,7 +549,11 @@ struct CachingHostAllocatorImpl { } void resetAccumulatedStats() { +<<<<<<< HEAD // Resetting accumulated memory stats requires concurrently holding both the +======= + // Reseting accumulated memory stats requires concurrently holding both the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // free list mutexes and the blocks mutex. Previously, this was only done in // empty_cache function. 
for (size_t i = 0; i < free_list_.size(); ++i) { @@ -464,11 +562,17 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { +<<<<<<< HEAD stats_.allocations.reset_accumulated(); stats_.allocated_bytes.reset_accumulated(); } stats_.active_bucket_stats[i].reset_accumulated(); stats_.active_bytes_bucket_stats[i].reset_accumulated(); +======= + stats_.allocation.reset_accumulated(); + stats_.allocated_bytes.reset_accumulated(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocation_bucket_stats[i].reset_accumulated(); stats_.allocated_bytes_bucket_stats[i].reset_accumulated(); } @@ -482,7 +586,11 @@ struct CachingHostAllocatorImpl { } void resetPeakStats() { +<<<<<<< HEAD // Resetting peak memory stats requires concurrently holding both the +======= + // Reseting peak memory stats requires concurrently holding both the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // free list mutexes and the blocks mutex. Previously, this was only done in // empty_cache function. for (size_t i = 0; i < free_list_.size(); ++i) { @@ -491,11 +599,17 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { +<<<<<<< HEAD stats_.allocations.reset_peak(); stats_.allocated_bytes.reset_peak(); } stats_.active_bucket_stats[i].reset_peak(); stats_.active_bytes_bucket_stats[i].reset_peak(); +======= + stats_.allocation.reset_peak(); + stats_.allocated_bytes.reset_peak(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocation_bucket_stats[i].reset_peak(); stats_.allocated_bytes_bucket_stats[i].reset_peak(); } @@ -512,7 +626,11 @@ struct CachingHostAllocatorImpl { virtual void add_allocated_block(B* block) { std::lock_guard g(blocks_mutex_); blocks_.insert(block); +<<<<<<< HEAD stats_.allocations.increase(1); +======= + stats_.allocation.increase(1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.allocated_bytes.increase(block->size_); ptr_to_block_.insert({block->ptr_, block}); @@ -525,8 +643,11 @@ struct CachingHostAllocatorImpl { std::lock_guard g(free_list_[index].mutex_); stats_.allocation_bucket_stats[index].increase(1); stats_.allocated_bytes_bucket_stats[index].increase(size); +<<<<<<< HEAD stats_.active_bucket_stats[index].increase(1); stats_.active_bytes_bucket_stats[index].increase(size); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -537,8 +658,13 @@ struct CachingHostAllocatorImpl { B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; +<<<<<<< HEAD stats_.active_bucket_stats[index].increase(1); stats_.active_bytes_bucket_stats[index].increase(size); +======= + stats_.allocation_bucket_stats[index].increase(1); + stats_.allocated_bytes_bucket_stats[index].increase(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return block; } return nullptr; @@ -632,8 +758,13 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); +<<<<<<< HEAD 
stats_.active_bucket_stats[index].decrease(1); stats_.active_bytes_bucket_stats[index].decrease(size); +======= + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (size != -1) { return; } @@ -669,7 +800,11 @@ struct CachingHostAllocatorImpl { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); } +<<<<<<< HEAD alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_; +======= + alignas(64) std::mutex blocks_mutex_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ska::flat_hash_set blocks_; // block list ska::flat_hash_map ptr_to_block_; @@ -677,17 +812,28 @@ struct CachingHostAllocatorImpl { // size. This allows us to quickly find a free block of the right size. // We use deque to store per size free list and guard the list with its own // mutex. +<<<<<<< HEAD alignas(hardware_destructive_interference_size) std::vector> free_list_{MAX_SIZE_INDEX}; alignas(hardware_destructive_interference_size) std::mutex events_mutex_; +======= + alignas(64) std::vector> free_list_ = + std::vector>(MAX_SIZE_INDEX); + + alignas(64) std::mutex events_mutex_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::deque> events_; // event queue paired with block // Indicates whether the object is active. // Set to false in the destructor to signal background threads to stop. std::atomic active_{true}; protected: +<<<<<<< HEAD alignas(hardware_destructive_interference_size) HostStatsStaged stats_; +======= + alignas(64) HostStatsStaged stats_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API HostAllocator : public at::Allocator { diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 679308872989d..90b21fd630fd7 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -59,7 +59,13 @@ struct TORCH_API Generator { explicit Generator(c10::intrusive_ptr gen_impl) : impl_(std::move(gen_impl)) { +<<<<<<< HEAD TORCH_CHECK(impl_.get(), "GeneratorImpl with nullptr is not supported"); +======= + if (impl_.get() == nullptr) { + throw std::runtime_error("GeneratorImpl with nullptr is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool operator==(const Generator& rhs) const { diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 0bbeb9ddc13ae..3ee25c0edc805 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -49,7 +49,11 @@ static void check_unique_names(DimnameList names) { } void check_names_valid_for(const TensorBase& tensor, DimnameList names) { +<<<<<<< HEAD impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); +======= + return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void check_names_valid_for(size_t tensor_dim, DimnameList names) { diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 52acae90b1280..759f7eb3ff8d7 
100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -27,11 +27,19 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { HasNonWildcard }; +<<<<<<< HEAD explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names) : names_(names.vec()) { check_invariants(); } explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector&& names) +======= + explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names) + : names_(names.vec()) { + check_invariants(); + } + explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector&& names) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : names_(std::move(names)) { check_invariants(); } @@ -52,13 +60,21 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); })); } +<<<<<<< HEAD void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) { +======= + void set_names(HAS_NON_WILDCARD, DimnameList new_names) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); std::copy(new_names.begin(), new_names.end(), names_.begin()); check_invariants(); } +<<<<<<< HEAD void set_names(HAS_NON_WILDCARD /*unused*/, std::vector&& new_names) { +======= + void set_names(HAS_NON_WILDCARD, std::vector&& new_names) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); names_ = std::move(new_names); check_invariants(); diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index e8bac545933ca..9d967203751ca 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,17 @@ class philox_engine { } +<<<<<<< HEAD static constexpr uint32_t kPhilox10A = 0x9E3779B9; static constexpr uint32_t kPhilox10B = 0xBB67AE85; static constexpr uint32_t kPhiloxSA = 0xD2511F53; static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; +======= + static const uint32_t kPhilox10A = 0x9E3779B9; + static const uint32_t kPhilox10B = 0xBB67AE85; + static const uint32_t kPhiloxSA = 0xD2511F53; + static const uint32_t kPhiloxSB = 0xCD9E8D57; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 39f4e7cb69764..f5a0484db4c25 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -2,7 +2,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { @@ -54,24 +57,38 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_ TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); // StashTLSOnEntryGuard stash_guard; +<<<<<<< HEAD c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset); const auto& schema = op.schema(); const auto num_arguments = schema.arguments().size(); +======= + 
c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); if (mode_stack_len > 0) { +<<<<<<< HEAD RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack); return; } +<<<<<<< HEAD RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments)); // Otherwise, find a PyInterpreter on a Tensor +======= + // Otherwise, find a PyInterpreter on a Tensor + const auto& schema = op.schema(); + const auto num_arguments = schema.arguments().size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // It is safe to dispatch on the very first Tensor with a pyobj_interpreter // without checking the interpreters of any of the arguments, because when // we actually run dispatch(), we will take out PyObjects in the context diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index 83b39de34d782..6bf76c229fe44 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -13,7 +13,11 @@ class TORCH_API PythonOpRegistrationTrampoline final { public: // Returns true if you successfully registered yourself (that means // you are in the hot seat for doing the operator registrations!) +<<<<<<< HEAD static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/); +======= + static bool registerInterpreter(c10::impl::PyInterpreter*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Returns nullptr if no interpreter has been registered yet. 
static c10::impl::PyInterpreter* getInterpreter(); diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index c5f887f096cd1..650b338000b01 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -138,7 +138,11 @@ void Tensor::_backward(TensorList inputs, const std::optional& gradient, std::optional keep_graph, bool create_graph) const { +<<<<<<< HEAD impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); +======= + return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const { @@ -173,6 +177,7 @@ unsigned TensorBase::_register_hook(std::function return impl::GetVariableHooks()->_register_hook(*this, std::move(hook)); } +<<<<<<< HEAD std::optional TensorBase::grad_dtype() const { return impl::GetVariableHooks()->grad_dtype(*this); } @@ -181,4 +186,6 @@ void TensorBase::set_grad_dtype(const std::optional& grad_dtype) con return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2b9558197bdcb..8da2cfcbbbe6a 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -1,5 +1,6 @@ #pragma once +<<<<<<< HEAD // See https://github.com/pytorch/pytorch/issues/161660 // This compile flag is intended to be passed in to CppExtensions that rely on // the stable ABI via the `extra_compile_args` argument. This is a stopgap @@ -13,6 +14,8 @@ "TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed" #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -100,7 +103,11 @@ class TORCH_API TensorBase { // Create a Tensor with a +0 reference count. Special care must be // taken to avoid decrementing this reference count at destruction // time. Intended to support MaybeOwnedTraits. 
+<<<<<<< HEAD explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs) +======= + explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : impl_(c10::intrusive_ptr(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {} friend MaybeOwnedTraits; @@ -111,7 +118,13 @@ class TORCH_API TensorBase { explicit TensorBase( c10::intrusive_ptr tensor_impl) : impl_(std::move(tensor_impl)) { +<<<<<<< HEAD TORCH_CHECK(impl_.get(), "TensorImpl with nullptr is not supported"); +======= + if (impl_.get() == nullptr) { + throw std::runtime_error("TensorImpl with nullptr is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TensorBase(const TensorBase&) = default; TensorBase(TensorBase&&) noexcept = default; @@ -135,7 +148,11 @@ class TORCH_API TensorBase { } TensorBase contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const { +<<<<<<< HEAD if (is_contiguous_or_false(memory_format)) { +======= + if (is_contiguous(memory_format)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return *this; } else { return __dispatch_contiguous(memory_format); @@ -276,6 +293,7 @@ class TORCH_API TensorBase { return impl_->is_contiguous(memory_format); } +<<<<<<< HEAD // Like is_contiguous, but more dynamic shape-friendly. May return a symbolic representation of // contiguity instead of SymTrue SymFalse, when results are data-dependent. c10::SymBool sym_is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const { @@ -295,6 +313,8 @@ class TORCH_API TensorBase { return impl_->is_contiguous(memory_format); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool is_non_overlapping_and_dense() const { return impl_->is_non_overlapping_and_dense(); } @@ -928,10 +948,13 @@ class TORCH_API TensorBase { const TensorBase& requires_grad_(bool _requires_grad=true) const; +<<<<<<< HEAD std::optional grad_dtype() const; void set_grad_dtype(const std::optional& grad_dtype) const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -952,7 +975,11 @@ class TORCH_API TensorBase { c10::intrusive_ptr impl_; private: +<<<<<<< HEAD TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const; +======= + TensorBase __dispatch_contiguous(c10::MemoryFormat) const; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; inline DeviceIndex get_device(const TensorBase& self) { diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index dad18bd019bbe..83a8260bd56c6 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -117,7 +117,11 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) { template <> C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) { // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function +<<<<<<< HEAD return median + sigma * at::tan(c10::pi * (val - 0.5)); +======= + 
return median + sigma * at::tan(c10::pi * (val - static_cast(0.5))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index dad3f090bb1ea..fdc8bf6e6e95a 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -109,10 +109,13 @@ TORCH_LIBRARY_IMPL(_, AutogradHPU, m) { m.fallback(AUTOGRAD_FALLBACK); } +<<<<<<< HEAD TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) { m.fallback(AUTOGRAD_FALLBACK); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef AUTOGRAD_FALLBACK } // namespace diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index c0f270700e3ce..8792156ac6064 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -68,8 +68,11 @@ struct TORCH_API VariableHooksInterface { const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; +<<<<<<< HEAD virtual std::optional grad_dtype(const TensorBase&) const = 0; virtual void set_grad_dtype(const TensorBase&, const std::optional&) const = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); diff --git a/aten/src/ATen/core/boxing/BoxedKernel.h b/aten/src/ATen/core/boxing/BoxedKernel.h index c5e46d8de000d..7abdc4b22c9a9 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel.h +++ b/aten/src/ATen/core/boxing/BoxedKernel.h @@ -18,10 +18,17 @@ class KernelFunction; // implementation notes; notably, this does NOT actually go through the // boxing/unboxing codepath. TORCH_API void fallthrough_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [Ambiguity in AutogradOther kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,10 +69,17 @@ TORCH_API void fallthrough_kernel( // than arbitrarily pick one or the other, we just register a kernel that raises // an error and let the user decide how to proceed. TORCH_API void ambiguous_autogradother_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*op*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note [named_not_supported_kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,10 +89,17 @@ TORCH_API void ambiguous_autogradother_kernel( // give a good error message in cases when boxing is not supported). When // boxing is universally supported this can be removed. 
[[noreturn]] TORCH_API void named_not_supported_kernel( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& /*op*/, DispatchKeySet /*unused*/, Stack* /*unused*/); +======= + OperatorKernel*, + const OperatorHandle&, + DispatchKeySet, + Stack*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /** * BoxedKernel is similar to a std::function storing a boxed kernel. @@ -185,16 +206,28 @@ class TORCH_API BoxedKernel final { template static void make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*unused*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack); template static void make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*ks*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack); explicit BoxedKernel( diff --git a/aten/src/ATen/core/boxing/BoxedKernel_impl.h b/aten/src/ATen/core/boxing/BoxedKernel_impl.h index 04ba1368f070a..ef94a608e00c4 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h +++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h @@ -2,7 +2,11 @@ namespace c10 { +<<<<<<< HEAD inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} +======= +inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline BoxedKernel::BoxedKernel( std::unique_ptr functor, @@ -11,9 +15,15 @@ inline BoxedKernel::BoxedKernel( template inline void BoxedKernel::make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet /*unused*/, +======= + OperatorKernel*, + const OperatorHandle& opHandle, + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Stack* stack) { // Note that we're dropping the DispatchKeySet argument. // See Note [Plumbing Keys Through The Dispatcher 2] for details. @@ -22,7 +32,11 @@ inline void BoxedKernel::make_boxed_function( template inline void BoxedKernel::make_boxed_function( +<<<<<<< HEAD OperatorKernel* /*unused*/, +======= + OperatorKernel*, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const OperatorHandle& opHandle, DispatchKeySet ks, Stack* stack) { diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index dd2fb32e6817d..1b1d2e752c2e2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -10,7 +10,11 @@ namespace c10 { // be handled specially. Its semantics is that it redispatches to the // *next* dispatch key that would have been processed, skipping the current // one. 
+<<<<<<< HEAD void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(0, "fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. " "This could occur if you registered a fallthrough kernel as a override for a specific operator " @@ -19,7 +23,11 @@ void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unus "let us know in the bug tracker."); } +<<<<<<< HEAD void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. " "This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering " @@ -32,7 +40,11 @@ void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHa "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } +<<<<<<< HEAD void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { +======= +void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING // See Note [named_not_supported_kernel] TORCH_CHECK(0, diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 0314dcd9903e7..f2aec1977d2db 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -218,7 +218,11 @@ class TORCH_API KernelFunction final { * &unboxed_func>(); */ template +<<<<<<< HEAD static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); +======= + static KernelFunction makeFromUnboxedFunction(FuncPtr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /** * Create a KernelFunction from an unboxed function. @@ -260,7 +264,11 @@ class TORCH_API KernelFunction final { std::string dumpState() const; // For testing internal invariants only +<<<<<<< HEAD bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; +======= + bool _equalsBoxedAndUnboxed(const KernelFunction&) const; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: explicit KernelFunction( diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 5b645506206f9..5077e03e18af6 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,12 +15,22 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { +<<<<<<< HEAD return std::make_unique(std::forward(args)...); +======= + return std::unique_ptr(new Child(std::forward(args)...)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace detail inline KernelFunction::KernelFunction() +<<<<<<< HEAD : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} +======= + : boxed_kernel_func_(), + unboxed_kernel_func_(nullptr), + sym_unboxed_kernel_func_(nullptr) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline KernelFunction::KernelFunction( std::unique_ptr functor, diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 7fbc3b982609f..8c0cf276d9075 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -131,7 +131,11 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( new (dest++) IValue(options.pinned_memory()); } +<<<<<<< HEAD inline void boxArgsToStack(IValue*& /*unused*/) {} +======= +inline void boxArgsToStack(IValue*&) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( @@ -185,7 +189,11 @@ struct PopResult> final { template static Result pop_to_tuple_impl( Stack& stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple((std::move(stack[indices]).template to())...); } }; diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 34b1514f32cdb..8f8ba4a558f98 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -105,7 +105,11 @@ using supported_primitive_arg_types = guts::typelist::typelist< // So a valid input type is one that our boxed functor wrapper can // unbox from an IValue into a C++ value. // +<<<<<<< HEAD // Whereas a valid output type is one that our wrapper can receive +======= +// Whereas a valid output type is one that our wrapper can recieve +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // as a C++ value from the unboxed functor, and box into an IValue. // @@ -561,7 +565,11 @@ struct wrap_kernel_functor_unboxed_< // doesn't use && static ReturnType call( OperatorKernel* functor, +<<<<<<< HEAD DispatchKeySet /*unused*/, +======= + DispatchKeySet, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ParameterTypes... 
args) { KernelFunctor* functor_ = static_cast(functor); // Note [Plumbing Keys Through The Dispatcher 2] @@ -629,8 +637,13 @@ call_functor_with_args_from_stack_( OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/, guts::typelist::typelist* /*unused*/) { +======= + std::index_sequence, + guts::typelist::typelist*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would // be unused and we have to silence the compiler warning. @@ -708,7 +721,11 @@ struct push_outputs, AllowDeprecatedTypes> final { static void call_( std::tuple&& output, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::jit::push( *stack, return_to_ivalue::call( @@ -718,7 +735,11 @@ struct push_outputs, AllowDeprecatedTypes> final { static void copy_( const std::tuple& output, Stack* stack, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::jit::push( *stack, return_to_ivalue::copy( @@ -741,7 +762,11 @@ struct make_boxed_from_unboxed_functor final { static void call( OperatorKernel* functor, +<<<<<<< HEAD const OperatorHandle& /*unused*/, +======= + const OperatorHandle&, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DispatchKeySet dispatchKeySet, Stack* stack) { using ReturnType = diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 8c837871dff7d..fddc5f844c787 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,13 +63,22 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, +<<<<<<< HEAD std::optional /*unused*/, c10::function_ref /*unused*/) override { +======= + std::optional, + c10::function_ref) override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run(stack); return false; } +<<<<<<< HEAD bool call(Stack& stack, c10::function_ref /*unused*/) +======= + bool call(Stack& stack, c10::function_ref) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override { run(stack); return false; diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index ea537400ef73d..2524980cc4988 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -148,7 +148,11 @@ struct TORCH_API ClassType : public NamedType { void checkNotExist(const std::string& name, const std::string& what) const; +<<<<<<< HEAD // Attributes are stored in a specific slot at runtime for efficiency. +======= + // Attributes are stored in a specific slot at runtime for effiency. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // When emitting instructions we specify the slot so that attribute access is // a constant lookup std::optional findAttributeSlot(const std::string& name) const { @@ -412,7 +416,11 @@ struct TORCH_API ClassType : public NamedType { // Holds method attributes std::weak_ptr compilation_unit_; +<<<<<<< HEAD // Holds all attributes, attribute details are found on ClassAttribute +======= + // Holds all atrributes, attribute details are found on ClassAttribute +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector attributes_; // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef. // Never fill this without using the appropriate provideNewClassAttribute method diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index dbd00e9c52909..1e32f59b5302f 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -80,8 +80,12 @@ struct MultiDispatchKeySet : at::IterArgs { ts = ts | x.key_set(); } } +<<<<<<< HEAD [[noreturn]] void operator()( at::ArrayRef> /*unused*/) { +======= + [[noreturn]] void operator()(at::ArrayRef>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Just checking that the handling of Tensor?[] didn't change. TORCH_INTERNAL_ASSERT(false); } @@ -96,7 +100,11 @@ struct MultiDispatchKeySet : at::IterArgs { } } template +<<<<<<< HEAD void operator()(const T& /*unused*/) { +======= + void operator()(const T&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // do nothing } }; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index afcaf51f231ae..8a50f1d44aee4 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,7 +76,17 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name, OpRegistrationListener::~OpRegistrationListener()= default; +<<<<<<< HEAD Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique()), guard_(std::make_shared()) +======= +Dispatcher::Dispatcher() +: operators_() +, operatorLookupTable_() +, backendFallbackKernels_() +, listeners_(std::make_unique()) +, cond_var_() +, guard_(std::make_shared()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {} Dispatcher::~Dispatcher() { @@ -442,6 +452,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); TORCH_CHECK(idx >= 0 && static_cast(idx) < backendFallbackKernels_.size(), "idx=", idx); +<<<<<<< HEAD // NB: Perserve BC for registering fallback for AutogradPrivateUse1 multiple time, // refer to https://github.com/pytorch/pytorch/issues/163979 for more informations. 
TORCH_CHECK( @@ -453,6 +464,13 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker backendFallbackKernels_[idx].debug, ", new registration ", debug); +======= + TORCH_CHECK( + !backendFallbackKernels_[idx].kernel.isValid(), + "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", + backendFallbackKernels_[idx].debug, ", new registration ", debug + ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unboxed backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); @@ -537,7 +555,11 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK // Note: this records a sequence number for both Autograd keys, and for // non-Autograd keys where the dispatchKeySet still contains an autograd key. +<<<<<<< HEAD // This means that we might collect the same sequence number two different +======= + // This means that we might collect the same sequence nubmer two different +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // events if they all occurred above Autograd and still had the Autograd // dispatch key in the dispatch key set. // However, this usually doesn't happen: normally the first call will @@ -568,9 +590,15 @@ bool Dispatcher::profilingOperatorEvents() { return TORCH_SDT_IS_ENABLED(operator_start) || TORCH_SDT_IS_ENABLED(operator_end); } +<<<<<<< HEAD C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref, std::vector& argsAddresses, std::vector& argsTypes) { if (TORCH_SDT_IS_ENABLED(operator_start)) { TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str(), argsAddresses.size(), argsAddresses.data(), argsTypes.data()); +======= +C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { + if (TORCH_SDT_IS_ENABLED(operator_start)) { + TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 79a8e28d88b64..6ef75c054f58d 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -96,7 +96,11 @@ class TORCH_API Dispatcher final { friend class TypedOperatorHandle; struct Guard final { +<<<<<<< HEAD Guard() : alive(true) {} +======= + Guard() : alive(true), mutex() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::atomic alive; std::mutex mutex; }; @@ -371,10 +375,14 @@ class TORCH_API Dispatcher final { #ifdef FBCODE_CAFFE2 static bool profilingOperatorEvents(); +<<<<<<< HEAD static void fireOpStartUSDT( at::RecordFunction::schema_ref_t schema_ref, std::vector& argsAddresses, std::vector& argsTypes); +======= + static void fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref); #endif // FBCODE_CAFFE2 @@ -492,7 +500,11 @@ class TORCH_API 
OperatorHandle { } void checkInvariants() const { +<<<<<<< HEAD operatorDef_->op.checkInvariants(); +======= + return operatorDef_->op.checkInvariants(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } c10::ArrayRef getTags() const { @@ -581,7 +593,11 @@ class TORCH_API OperatorHandle { // We need to store this iterator in order to make // Dispatcher::cleanup() fast -- it runs a lot on program +<<<<<<< HEAD // termination (and presumably library unloading). +======= + // termination (and presuambly library unloading). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::list::iterator operatorIterator_; }; @@ -629,7 +645,11 @@ class TypedOperatorHandle final : public OperatorHandle { namespace detail { template +<<<<<<< HEAD inline void unused_arg_(const Args&... /*unused*/) {} +======= +inline void unused_arg_(const Args&...) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // CaptureKernelCall is intended to capture return values from Dispatcher // unboxed kernel calls. A record function may request to get outputs from the @@ -798,6 +818,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( #ifdef FBCODE_CAFFE2 if (profilingOperatorEvents()) { +<<<<<<< HEAD std::vector argsAddresses = {(void*)(&args)...}; std::vector argsTypes = {(typeid(args).name())...}; struct FireOpRAII { @@ -807,12 +828,22 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( std::vector& argsTypes) : schema_ref_(schema_ref) { fireOpStartUSDT(schema_ref, argsAddresses, argsTypes); +======= + struct FireOpRAII { + FireOpRAII(at::RecordFunction::schema_ref_t schema_ref) + : schema_ref_(schema_ref) { + fireOpStartUSDT(schema_ref); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ~FireOpRAII() { fireOpEndUSDT(schema_ref_); } at::RecordFunction::schema_ref_t schema_ref_; +<<<<<<< HEAD } event(op.schema(), argsAddresses, argsTypes); +======= + } event(op.schema()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return kernel.template call( op, dispatchKeySet, std::forward(args)...); } else { @@ -928,7 +959,11 @@ inline void Dispatcher::redispatchBoxed( } #endif const auto& kernel = entry.lookup(dispatchKeySet); +<<<<<<< HEAD kernel.callBoxed(op, dispatchKeySet, stack); +======= + return kernel.callBoxed(op, dispatchKeySet, stack); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index add1ba059ea8a..adfcf1e312186 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -62,7 +62,21 @@ static const auto& getDispatchTableIndexToKey() { } OperatorEntry::OperatorEntry(OperatorName&& operator_name) +<<<<<<< HEAD : name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_)) +======= +: name_(std::move(operator_name)) +, schema_() +#ifndef C10_MOBILE +, tags_() +#endif +, dispatchTable_() +, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) +, 
kernels_() +, cpp_signature_() +, sym_cpp_signature_() +, is_observed_(ObservedOperators::isObserved(name_)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { // Pick up any backend fallbacks that were registered prior to this // OperatorEntry being created. @@ -329,7 +343,11 @@ std::pair OperatorEntry::computeDispatchTab // For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration // to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd]. // For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of +<<<<<<< HEAD // its backends and ask backend extender to request a dedicated Autograd key for the backend. +======= + // its backends and ask backend extender to request a decicated Autograd key for the backend. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Ambiguity in AutogradOther kernel] for more details. // A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't // cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available) diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 176b16f7265fe..3a714c9193677 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -105,7 +105,11 @@ class TORCH_API OperatorEntry final { // versa that is an error. (Refcounting for the registrations is // handled in the OperatorHandle in Dispatcher) void registerSchema( +<<<<<<< HEAD FunctionSchema&& /*schema*/, +======= + FunctionSchema&&, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string&& debug, std::vector tags = {}); void deregisterSchema(); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 2b1a32bd0ac8a..a52cd5a67514b 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -177,7 +177,11 @@ bool DynamicType::equals(const Type& rhs) const { return equals(*create(rhs)); } +<<<<<<< HEAD bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const { +======= +bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto other = create(rhs); if (tag_ == other->tag_) { if (equals(*other)) { @@ -371,7 +375,11 @@ DynamicTypePtr ivalue::TupleTypeFactory::create( } DynamicTypePtr ivalue::TupleTypeFactory::fallback( +<<<<<<< HEAD const Type& /*unused*/) { +======= + const Type&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return nullptr; } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index ee0d077e5c51a..61f47bde598ef 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,7 +64,10 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, 
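Several of the conflicting hunks above differ only in whether an intentionally unused parameter is left unnamed or carries its name in a /*comment*/. A small sketch of why the idiom exists (plain standard C++, no PyTorch types): an unnamed or comment-named parameter still participates in overload resolution and overriding, but cannot trip -Wunused-parameter in a default body that ignores it.

    struct Stack {};

    struct FunctionBase {
      // Default overload: the parameters exist so derived classes can override
      // with a matching signature; naming them only in comments documents that
      // this body deliberately ignores them.
      virtual bool call(Stack& /*stack*/, int /*bailout_depth*/) {
        return false;
      }
      virtual ~FunctionBase() = default;
    };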
kDynamicIntTypeBit, 1) \ +<<<<<<< HEAD _(SymBool, kDynamicIntTypeBit, 1) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; @@ -138,8 +141,13 @@ class DynamicType : public SharedType { struct Arguments { Arguments() = default; +<<<<<<< HEAD Arguments(c10::ArrayRef /*args*/); Arguments(const std::vector& /*names*/, c10::ArrayRef /*args*/); +======= + Arguments(c10::ArrayRef); + Arguments(const std::vector&, c10::ArrayRef); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector elems; }; @@ -156,15 +164,24 @@ class DynamicType : public SharedType { static const TypeKind Kind = TypeKind::DynamicType; static TORCH_API DynamicTypePtr create(Type& ty); +<<<<<<< HEAD explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/); explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/); +======= + explicit DynamicType(Tag, Arguments); + explicit DynamicType(Tag, std::string_view, Arguments); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DynamicType(DynamicType&& other) = delete; DynamicType(const DynamicType&) = delete; DynamicType& operator=(const DynamicType&) = delete; DynamicType& operator=(DynamicType&&) = delete; +<<<<<<< HEAD TypePtr containedType(size_t /*i*/) const override; +======= + TypePtr containedType(size_t) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t containedTypeSize() const override; Tag tag() const { return tag_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index 83db2ec9d71df..c4f48ed8cd3ce 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -96,15 +96,25 @@ struct TORCH_API Function { // Overload for server interpreter, a bailout size is needed for graph // executor. virtual bool call( +<<<<<<< HEAD Stack& /*unused*/, std::optional /*unused*/, c10::function_ref /*unused*/) { +======= + Stack&, + std::optional, + c10::function_ref) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } // Overload for mobile interpreter. +<<<<<<< HEAD virtual bool call(Stack& /*unused*/, c10::function_ref /*unused*/) { +======= + virtual bool call(Stack&, c10::function_ref) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6587af0f9ccc0..b0e009aa07e58 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -261,7 +261,11 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { // // There are 2 cases // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'. +<<<<<<< HEAD // without the extra parenthesis, the c++ scheme parser can not parse it. +======= + // without the extra parenthesis, the c++ schem parser can not parse it. 
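InternedStrings::_symbol in the hunk above rejects any symbol that is not namespace-qualified; the only structural requirement is a "::" separator. A minimal sketch of that check under the same assumption (the first "::" splits namespace from name; splitQualified is an illustrative helper, not part of the API):

    #include <stdexcept>
    #include <string>
    #include <utility>

    std::pair<std::string, std::string> splitQualified(const std::string& s) {
      auto pos = s.find("::");
      if (pos == std::string::npos) {
        throw std::runtime_error("all symbols must have a namespace, ::, but found: " + s);
      }
      return {s.substr(0, pos), s.substr(pos + 2)};   // {"aten", "add"} for "aten::add"
    }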
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 2. something like '-> ((str, str))'. Need extra parenthesis so the return // type is a single tuple rather than two strings. // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about diff --git a/aten/src/ATen/core/interned_strings.cpp b/aten/src/ATen/core/interned_strings.cpp index 799f6821bb928..5c44c1122edae 100644 --- a/aten/src/ATen/core/interned_strings.cpp +++ b/aten/src/ATen/core/interned_strings.cpp @@ -68,7 +68,15 @@ Symbol InternedStrings::_symbol(const std::string& s) { return it->second; auto pos = s.find("::"); +<<<<<<< HEAD TORCH_CHECK(pos != std::string::npos, "all symbols must have a namespace, ::, but found: ", s); +======= + if (pos == std::string::npos) { + std::stringstream ss; + ss << "all symbols must have a namespace, ::, but found: " << s; + throw std::runtime_error(ss.str()); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); Symbol sym(sym_to_info_.size()); @@ -117,7 +125,16 @@ std::string Symbol::domainString() const { } Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) { +<<<<<<< HEAD TORCH_CHECK(d.compare(0, domain_prefix().size(), domain_prefix()) == 0, "Symbol: domain string is expected to be prefixed with '", domain_prefix(), "', e.g. 'org.pytorch.aten'"); +======= + if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) { + std::ostringstream ss; + ss << "Symbol: domain string is expected to be prefixed with '" + << domain_prefix() << "', e.g. 'org.pytorch.aten'"; + throw std::runtime_error(ss.str()); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string qualString = d.substr(domain_prefix().size()) + "::" + s; return fromQualString(qualString); } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index bb01c47e055a8..b0a6cf0ebcbcd 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -98,8 +101,11 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); +<<<<<<< HEAD case Tag::UInt: return IntType::get(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return c10::SymIntType::get(); case Tag::SymFloat: @@ -323,8 +329,11 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); +<<<<<<< HEAD case Tag::UInt: return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::SymFloat: @@ -358,7 +367,11 @@ IValue IValue::equals(const IValue& rhs) const { case Tag::Enum: return lhs.toEnumHolder()->is(*rhs.toEnumHolder()); case Tag::Uninitialized: +<<<<<<< HEAD 
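The Tag::UInt cases being added or dropped above all follow from IValue being a tagged union: a Tag enum records which payload member is live, and every switch over the tag must stay exhaustive. A minimal sketch of the layout (MiniValue is a made-up stand-in, not the real IValue):

    #include <cstdint>
    #include <stdexcept>

    struct MiniValue {
      enum class Tag { Int, UInt, Double, Bool };
      union Payload {
        int64_t as_int;
        uint64_t as_uint;
        double as_double;
        bool as_bool;
      } payload;
      Tag tag;

      double toDouble() const {
        switch (tag) {                 // every Tag must be handled
          case Tag::Int:    return static_cast<double>(payload.as_int);
          case Tag::UInt:   return static_cast<double>(payload.as_uint);
          case Tag::Double: return payload.as_double;
          case Tag::Bool:   return payload.as_bool ? 1.0 : 0.0;
        }
        throw std::runtime_error("unreachable");
      }
    };
    // Usage: MiniValue v; v.tag = MiniValue::Tag::UInt; v.payload.as_uint = 42; v.toDouble();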
// Uninitialized ivalues show up in no-ops when the compiler can prove a +======= + // Unitialized ivalues show up in no-ops when the compiler can prove a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // value will never be used. Just return false on any equality comparison. return false; } @@ -384,8 +397,11 @@ size_t IValue::hash(const IValue& v) { case Tag::Int: return c10::get_hash(v.payload.u.as_int); // NB: these are technically strict aliasing violations +<<<<<<< HEAD case Tag::UInt: return c10::get_hash(v.payload.u.as_int); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return c10::get_hash(v.payload.u.as_int); case Tag::SymFloat: @@ -413,7 +429,11 @@ size_t IValue::hash(const IValue& v) { case Tag::Enum: case Tag::Stream: case Tag::Uninitialized: +<<<<<<< HEAD TORCH_CHECK(false, +======= + throw std::runtime_error( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unhashable type: '" + v.type()->repr_str() + "'"); } // the above switch should be exhaustive @@ -813,8 +833,11 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); +<<<<<<< HEAD case IValue::Tag::UInt: return out << v.toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case IValue::Tag::SymInt: return out << v.toSymInt(); case IValue::Tag::SymFloat: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index f13b0613691b4..4041e8b198b8a 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -12,7 +12,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -161,7 +164,10 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ +<<<<<<< HEAD _(UInt) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(SymInt) \ _(SymFloat) \ _(SymBool) \ @@ -624,6 +630,7 @@ struct TORCH_API IValue final { IValue(const c10::SymBool& i) { if (auto mi = i.maybe_as_bool()) { tag = Tag::Bool; +<<<<<<< HEAD #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ payload.u.as_int = *mi; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -632,6 +639,9 @@ struct TORCH_API IValue final { #else #error Unexpected or undefined __BYTE_ORDER__ #endif +======= + payload.u.as_int = *mi; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { tag = Tag::SymBool; payload.u.as_intrusive_ptr = i.toSymNodeImpl().release(); @@ -662,6 +672,7 @@ struct TORCH_API IValue final { } } +<<<<<<< HEAD // Unsigned IValue(uint64_t u) : tag( u <= std::numeric_limits::max() ? 
Tag::Int : Tag::UInt) { payload.u.as_uint = u; @@ -685,6 +696,8 @@ struct TORCH_API IValue final { } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Bool IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) @@ -854,7 +867,11 @@ struct TORCH_API IValue final { IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); +<<<<<<< HEAD IValue(std::nullopt_t /*unused*/); +======= + IValue(std::nullopt_t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ClassType IValue(c10::intrusive_ptr v); @@ -925,6 +942,7 @@ struct TORCH_API IValue final { } else { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( s.isIntegral(false), "Unknown type in Scalar"); +<<<<<<< HEAD if (s.isUnsigned()) { const auto val = s.toUInt64(); payload.u.as_uint = val; @@ -933,6 +951,10 @@ struct TORCH_API IValue final { payload.u.as_int = s.toLong(); tag = Tag::Int; } +======= + tag = Tag::Int; + payload.u.as_int = s.toLong(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -956,8 +978,11 @@ struct TORCH_API IValue final { return toSymFloat(); else if (isSymBool()) return toSymBool(); +<<<<<<< HEAD else if (isUnsigned()) return toUInt(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "IValue is not a Scalar"); } @@ -1176,7 +1201,11 @@ struct TORCH_API IValue final { using HashIdentityIValueMap = std::unordered_map; +<<<<<<< HEAD // Checks if this and rhs has a subvalues in common. +======= + // Chechs if this and rhs has a subvalues in common. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // [t1,t2] and [t2, t3] returns true. 
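The IValue(uint64_t) constructor above only switches to the dedicated UInt tag when the value no longer fits in a signed 64-bit integer; anything up to INT64_MAX keeps the ordinary Int tag so existing integer paths are unaffected. A standalone sketch of that rule:

    #include <cstdint>
    #include <limits>

    enum class Tag { Int, UInt };

    Tag tagForUnsigned(uint64_t u) {
      return u <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())
          ? Tag::Int     // representable as int64_t, no special handling needed
          : Tag::UInt;   // only the top half of the uint64_t range needs UInt
    }
    // tagForUnsigned(42) == Tag::Int; tagForUnsigned(UINT64_MAX) == Tag::UInt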
bool overlaps(const IValue& rhs) const; @@ -1287,8 +1316,11 @@ struct TORCH_API IValue final { return true; case Tag::Int: return false; +<<<<<<< HEAD case Tag::UInt: return false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case Tag::SymInt: return true; case Tag::SymFloat: @@ -1385,8 +1417,11 @@ struct TORCH_API IValue final { union TriviallyCopyablePayload { TriviallyCopyablePayload() : as_int(0) {} int64_t as_int; +<<<<<<< HEAD // See Note [Meaning of HAS_u] uint64_t as_uint; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double as_double; bool as_bool; // Invariant: never nullptr; null state is represented as diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8d1c3aa83dadb..418a2700820f2 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -660,7 +660,11 @@ struct TORCH_API TupleTypeFactory { template <> struct TORCH_API TupleTypeFactory { static DynamicTypePtr create(const std::vector& elemTypes); +<<<<<<< HEAD static DynamicTypePtr fallback(const Type& /*unused*/); +======= + static DynamicTypePtr fallback(const Type&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API Tuple : c10::intrusive_ptr_target { @@ -1501,7 +1505,11 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { // However, the CompilationUnit holds ownership of the type's graphs, so // inserting a constant object into a Graph would create a reference cycle if // that constant object held a shared_ptr to its CU. For these objects we +<<<<<<< HEAD // instantiate them with non-owning references to its CU +======= + // instatiate them with non-owning references to its CU +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) { slots_.resize(numSlots); } @@ -1682,7 +1690,11 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target { namespace detail { struct _guarded_unsigned_long_unique_dummy final { +<<<<<<< HEAD _guarded_unsigned_long_unique_dummy(int64_t /*unused*/){} +======= + _guarded_unsigned_long_unique_dummy(int64_t){} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; using _guarded_unsigned_long = std::conditional_t< std::is_same_v || @@ -1776,7 +1788,11 @@ template // native_functions.yaml still return std::vector. // C10_DEPRECATED_MESSAGE("IValues based on std::vector are potentially slow // and deprecated. Please use torch::List instead.") +<<<<<<< HEAD std::vector generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +std::vector generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to do a deep copy of the vector because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. 
@@ -1826,18 +1842,30 @@ c10::intrusive_ptr IValue::toCustomClass() const& { } template +<<<<<<< HEAD T generic_to(IValue ivalue, _fake_type /*unused*/) { +======= +T generic_to(IValue ivalue, _fake_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using ElemType = typename std::remove_pointer::type::element_type; return std::move(ivalue).template toCustomClass(); } template +<<<<<<< HEAD tagged_capsule generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +tagged_capsule generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return tagged_capsule{std::move(ivalue)}; } template +<<<<<<< HEAD c10::List generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +c10::List generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return impl::toTypedList(std::move(ivalue).toList()); } @@ -1867,7 +1895,11 @@ std::vector createVectorFromList(const c10::List& impl) { } template +<<<<<<< HEAD OptionalArray generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +OptionalArray generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (ivalue.isNone()) { return {}; } @@ -1880,8 +1912,13 @@ namespace detail { template std::array generic_to_array( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/, std::index_sequence /*unused*/) { +======= + _fake_type>, + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to do a deep copy of the array because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. @@ -1906,7 +1943,11 @@ std::array generic_to( template c10::Dict generic_to( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/) { +======= + _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return impl::toTypedDict(std::move(ivalue).toGenericDict()); } @@ -1915,7 +1956,11 @@ C10_DEPRECATED_MESSAGE( "IValues based on std::unordered_map are slow and deprecated. 
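The generic_to overloads above never receive a value of the requested type; they receive an empty _fake_type<T> tag whose only purpose is to steer overload resolution toward the right conversion. A minimal sketch of that tag-dispatch idiom (fake_type, Boxed and unbox are illustrative names, not the real API):

    #include <string>

    template <class T>
    struct fake_type {};   // empty tag carrying only a type

    struct Boxed {
      int i = 0;
      std::string s;
    };

    int generic_to(const Boxed& v, fake_type<int>) { return v.i; }
    std::string generic_to(const Boxed& v, fake_type<std::string>) { return v.s; }

    template <class T>
    T unbox(const Boxed& v) {
      return generic_to(v, fake_type<T>{});   // the tag argument picks the overload
    }
    // unbox<int>(b) and unbox<std::string>(b) call different overloads of generic_to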
Please use c10::Dict instead.") std::unordered_map generic_to( IValue ivalue, +<<<<<<< HEAD _fake_type> /*unused*/) { +======= + _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::unordered_map specialized_dict; for (const auto& item : std::move(ivalue).toGenericDict()) { @@ -1926,7 +1971,11 @@ std::unordered_map generic_to( } template +<<<<<<< HEAD std::optional generic_to(IValue ivalue, _fake_type> /*unused*/) { +======= +std::optional generic_to(IValue ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (ivalue.isNone()) { return std::nullopt; } @@ -1937,7 +1986,11 @@ namespace detail { template Tuple generic_to_tuple_impl( const ivalue::TupleElements& t, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple( t[INDEX].to::type>()...); } @@ -1951,7 +2004,11 @@ template < std::is_lvalue_reference..., std::negation>...>, std::nullptr_t> = nullptr> +<<<<<<< HEAD std::tuple generic_to(const IValue& ivalue, _fake_type> /*unused*/) { +======= +std::tuple generic_to(const IValue& ivalue, _fake_type>) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto& vals = ivalue.toTupleRef().elements(); TORCH_CHECK(vals.size() == sizeof...(Args)); return detail::generic_to_tuple_impl>(vals, Indices{}); @@ -2311,7 +2368,11 @@ inline IValue::IValue(std::optional v) : IValue() { } } +<<<<<<< HEAD inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {} +======= +inline IValue::IValue(std::nullopt_t) : IValue() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object) { @@ -2482,6 +2543,7 @@ namespace ivalue { namespace detail { template +<<<<<<< HEAD IValue from_(T&& x, std::true_type /*unused*/) { return IValue(std::forward(x)); } @@ -2491,6 +2553,17 @@ IValue from_(c10::intrusive_ptr x, std::false_type /*unused*/) { } template IValue from_(T&& /*x*/, std::false_type /*unused*/) { +======= +IValue from_(T&& x, std::true_type) { + return IValue(std::forward(x)); +} +template +IValue from_(c10::intrusive_ptr x, std::false_type) { + return IValue(std::move(x)); +} +template +IValue from_(T&& /*x*/, std::false_type) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2546,19 +2619,31 @@ struct MaybeOwnedTraits { return &borrow; } +<<<<<<< HEAD static bool debugBorrowIsValid(const borrow_type& /*unused*/) { +======= + static bool debugBorrowIsValid(const borrow_type&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } }; template <> struct IValue::TagType { +<<<<<<< HEAD static TORCH_API c10::TypePtr get(const IValue& /*v*/); +======= + static TORCH_API c10::TypePtr get(const IValue&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
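generic_to_tuple_impl above expands a compile-time index pack so that each slot of the resulting std::tuple is converted from the matching element of a runtime container. A small standalone sketch of the same std::index_sequence technique, converting a vector of strings (to_tuple/to_tuple_impl are illustrative names):

    #include <cstddef>
    #include <string>
    #include <tuple>
    #include <utility>
    #include <vector>

    template <class... Args, std::size_t... I>
    std::tuple<Args...> to_tuple_impl(const std::vector<std::string>& v,
                                      std::index_sequence<I...>) {
      return std::make_tuple(Args(v[I])...);   // one conversion per index in the pack
    }

    template <class... Args>
    std::tuple<Args...> to_tuple(const std::vector<std::string>& v) {
      return to_tuple_impl<Args...>(v, std::index_sequence_for<Args...>{});
    }
    // to_tuple<std::string, std::string>({"a", "b"}) yields ("a", "b")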
float/bfloat16/half (#2791)) }; template <> struct IValue::TagType { +<<<<<<< HEAD static TORCH_API c10::TypePtr get(const IValue& /*v*/); +======= + static TORCH_API c10::TypePtr get(const IValue&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; template diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 35c0d3530adcc..bceb3a3c7f14e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -117,8 +120,15 @@ struct SingleElementType : public SharedType { protected: SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) { +<<<<<<< HEAD TORCH_CHECK(this->elem, c10::str( "Can not create ", typeKindToString(Kind), " with None type")); +======= + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } private: @@ -373,7 +383,11 @@ struct TORCH_API SymbolicShape { // Unranked shape constructor. SymbolicShape() : dims_(std::nullopt) {} +<<<<<<< HEAD // Known rank but unknown dimensions. +======= + // Known rank but unknown dimentions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymbolicShape(std::optional rank) : dims_(std::nullopt) { if(!rank) { return; @@ -415,12 +429,24 @@ struct TORCH_API SymbolicShape { } ShapeSymbol operator[](size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } ShapeSymbol at(size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } @@ -515,7 +541,13 @@ struct VaryingShape { } const std::optional &operator[](size_t i) const { +<<<<<<< HEAD TORCH_CHECK(dims_, "Rank isn't fixed"); +======= + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*dims_).at(i); } @@ -884,9 +916,15 @@ struct TORCH_API ListType // global singleton // Given an inner type T and an identifier, +<<<<<<< HEAD // this function will return the global singleton type pointer // the type List. // The extra "identifier" argument is needed because we have multiple container types +======= + // this function wil return the global singleton type pointer + // the type List. + // The extra "identifier" argument is needed beccause we have multiple container types +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // that all re-use this function (List, array, etc.) 
static TypePtr get(const std::string& identifier, TypePtr inner); @@ -950,7 +988,13 @@ struct TORCH_API DictType : public SharedType { TypePtr createWithContained( std::vector contained_types) const override { +<<<<<<< HEAD TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types"); +======= + if (contained_types.size() != 2) { + throw std::runtime_error("Expected 2 contained types"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return create(std::move(contained_types.at(0)), std::move(contained_types.at(1))); } @@ -1225,7 +1269,11 @@ struct TORCH_API TupleType : public NamedType { std::shared_ptr schema_; }; +<<<<<<< HEAD // the common supertype of all Enums, only used in operator registration. +======= +// the common supertype of all Enums, only used in operator registraion. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // EnumType <: AnyEnumType for all Enums struct AnyEnumType; using AnyEnumTypePtr = SingletonTypePtr; diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 4db1cb18883be..53631dd03b2e9 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -185,11 +185,19 @@ struct TORCH_API Type { : repr_(nullptr) {} /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) +<<<<<<< HEAD : repr_(makeSingletonSharedPtr(p.get())) {} template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(makeSingletonSharedPtr(static_cast(p.get()))) {} +======= + : repr_(p) {} + + template , bool> = true> + /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) + : repr_(SingletonTypePtr(p.get())) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We need to support construction from T* for pybind. The problem @@ -202,8 +210,13 @@ struct TORCH_API Type { // Case 2: if T is exactly Type, we need to do a dynamic_cast to // check if it's a SharedType and do the right thing. // +<<<<<<< HEAD // Case 3: Otherwise, T is not a SharedType. Use a singleton // pointer. +======= + // Case 3: Otherwise, T is not a SharedType. (debug-check this + // assumption!) Use a singleton pointer. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : SingletonOrSharedTypePtr(static_cast::type>(p)->shared_from_this()) {} @@ -211,15 +224,25 @@ struct TORCH_API Type { template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) { if (auto* shared_p = dynamic_cast::type>(p)) { +<<<<<<< HEAD repr_ = shared_p->shared_from_this(); } else { repr_ = makeSingletonSharedPtr(p); +======= + repr_ = Repr(shared_p->shared_from_this()); + } else { + repr_ = Repr(p); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } template && !std::is_base_of_v, bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) +<<<<<<< HEAD : repr_(makeSingletonSharedPtr(p)) { +======= + : repr_(p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dynamic_cast::type>(p) == nullptr); } @@ -230,6 +253,7 @@ struct TORCH_API Type { ~SingletonOrSharedTypePtr() = default; T* get() const { +<<<<<<< HEAD return repr_.get(); } @@ -243,6 +267,21 @@ struct TORCH_API Type { bool operator!=(std::nullptr_t) const { return repr_ != nullptr; +======= + return repr_.isSharedAndNonNull() ? repr_.shared_.repr_.get() : static_cast(repr_.rawRepr().first); + } + + operator bool() const { + return repr_.isNonNull(); + } + + bool operator==(std::nullptr_t) const { + return !repr_.isNonNull(); + } + + bool operator!=(std::nullptr_t) const { + return repr_.isNonNull(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template , void>, bool> = true> @@ -255,6 +294,7 @@ struct TORCH_API Type { } private: +<<<<<<< HEAD // Use shared_ptr's aliasing constructor to create a non-owning pointer // to a singleton. The lifetime is tied to the null shared_ptr, so there's // no reference counting overhead for the singleton itself. @@ -263,6 +303,140 @@ struct TORCH_API Type { } std::shared_ptr repr_; +======= + // NOTE: SharedPtrWrapper exists to work around a baffling bug in + // nvcc; see comment in destroy() below. + struct SharedPtrWrapper { + SharedPtrWrapper(std::shared_ptr &&x) + : repr_(std::move(x)) {} + std::shared_ptr repr_; + }; + union Repr { + Repr() : Repr(nullptr) {} + + explicit Repr(std::shared_ptr x) + : shared_(std::move(x)) {} + + explicit Repr(std::nullptr_t) + : singletonRepr_(nullptr) {} + + explicit Repr(SingletonTypePtr p) + : singletonRepr_(p.get()) {} + + ~Repr() { + destroy(); + } + + // NOTE: the only non-UB way to access our null state is through + // rawRepr(), because our copy operation doesn't preserve which + // union member is active for null pointers. 
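The HEAD side above stores even singleton types in a plain std::shared_ptr by building a non-owning pointer with the aliasing constructor, while the incoming side keeps the hand-rolled union that follows. A minimal sketch of the aliasing-constructor trick (Type, intSingleton and makeNonOwning are illustrative stand-ins):

    #include <cassert>
    #include <memory>

    struct Type { int kind = 0; };

    Type& intSingleton() {
      static Type t{1};          // lives for the whole program
      return t;
    }

    std::shared_ptr<Type> makeNonOwning(Type* p) {
      // Aliasing constructor: share ownership with an empty shared_ptr (which owns
      // nothing) but point at p; use_count() stays 0 and p is never deleted.
      return std::shared_ptr<Type>(std::shared_ptr<void>(), p);
    }

    int main() {
      auto sp = makeNonOwning(&intSingleton());
      assert(sp.use_count() == 0 && sp->kind == 1);
    }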
+ Repr(const Repr& rhs) { + if (rhs.isSharedAndNonNull()) { + new (&shared_) SharedPtrWrapper(rhs.shared_); + } else { + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + } + + Repr(Repr&& rhs) noexcept { + if (rhs.isSharedAndNonNull()) { + new (&shared_) SharedPtrWrapper(std::move(rhs.shared_)); + } else { + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + } + + Repr& operator=(const Repr& rhs) { + if (&rhs == this) { + return *this; + } + if (rhs.isSharedAndNonNull()) { + if (isSharedAndNonNull()) { + shared_ = rhs.shared_; + } else { + new (&shared_) SharedPtrWrapper(rhs.shared_); + } + } else { + if (isSharedAndNonNull()) { + destroy(); + } + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + return *this; + } + + Repr& operator=(Repr&& rhs) noexcept { + if (&rhs == this) { + return *this; + } + if (rhs.isSharedAndNonNull()) { + if (isSharedAndNonNull()) { + shared_ = std::move(rhs.shared_); + } else { + new (&shared_) SharedPtrWrapper(std::move(rhs.shared_)); + } + } else { + if (isSharedAndNonNull()) { + destroy(); + } + singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr); + singletonRepr_.unused_ = nullptr; + } + return *this; + } + + SharedPtrWrapper shared_; + + struct SingletonRepr { + explicit SingletonRepr(T* s) : singleton_(s) {} + T* singleton_; + void* unused_ = nullptr; + } singletonRepr_; + struct RawRepr { + void* first; + void* nullIfSingleton_; + }; + + // It is UB to read the singleton part of Repr if it was + // constructed as a shared_ptr and vice versa, but memcpying out + // the representation is always OK, so here's an accessor to obey + // the letter of the law. + RawRepr rawRepr() const { + RawRepr repr{}; + memcpy(&repr, reinterpret_cast(this), sizeof(RawRepr)); + return repr; + } + + bool isNonNull() const { + auto repr = rawRepr(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(repr.nullIfSingleton_ == nullptr || repr.first != nullptr); + return repr.first != nullptr; + } + + bool isSharedAndNonNull() const { + return rawRepr().nullIfSingleton_ != nullptr; + } + + private: + void destroy() { + if (isSharedAndNonNull()) { + // Without SharedPtrWrapper, this line would read + // `shared_.~shared_ptr()` and nvcc would complain with + // "error: expected primary-expression before '>' token" + // referring to the "t" in "shared_ptr". SharedPtrWrapper + // exists to work around this compiler bug. 
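The union-based Repr above never reads an inactive member directly; it memcpy's the object bytes into a plain two-pointer struct and inspects those, relying on the singleton representation always leaving its second word null. A small sketch of that pattern, assuming only that both states occupy two pointer-sized words (Slot, RawView and inspect are illustrative names):

    #include <cstring>

    union Slot {
      struct Shared { void* ptr; void* ctrl; } shared;            // owning state: ctrl is non-null
      struct Singleton { void* ptr; void* null_marker; } single;  // non-owning: null_marker stays nullptr
    };

    struct RawView { void* first; void* second; };

    RawView inspect(const Slot& s) {
      static_assert(sizeof(RawView) == sizeof(Slot), "both states are two pointer-sized words");
      RawView v;
      std::memcpy(&v, &s, sizeof(v));   // copying bytes never reads an inactive union member
      return v;
    }

    bool isSingletonOrNull(const Slot& s) {
      return inspect(s).second == nullptr;   // mirrors the inverse of isSharedAndNonNull()
    }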
+ shared_.~SharedPtrWrapper(); + } + } + } repr_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; using TypePtr = SingletonOrSharedTypePtr; @@ -553,7 +727,11 @@ inline TypePtr Type::withContained(std::vector contained_types) { } +<<<<<<< HEAD inline bool operator==(const Type& lhs, const Type& rhs) { +======= +TORCH_API inline bool operator==(const Type& lhs, const Type& rhs) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (C10_UNLIKELY(!rhs.symmetric())) { return rhs.equals(lhs); } diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 0ee79ed85930b..adff65171c040 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -44,7 +44,11 @@ constexpr int checkStaticTypes() { } template +<<<<<<< HEAD constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { +======= +constexpr std::array createArgumentVectorFromTypes(std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ( // Check types for common errors checkStaticTypes(), diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 1f39ba4e38717..04ff9c3089850 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -114,7 +114,11 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i } next++; } else { +<<<<<<< HEAD if (allowlist.substr(cur) == item) { +======= + if (allowlist.substr(cur).compare(item) == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } break; diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index b34134309cb7a..4bbe62b1ab16b 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -73,7 +73,11 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( std::optional inferred_schema = std::nullopt; for (const auto& kernel : options.kernels) { +<<<<<<< HEAD if (nullptr != kernel.inferred_function_schema) { +======= + if (nullptr != kernel.inferred_function_schema.get()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!inferred_schema.has_value()) { inferred_schema = *kernel.inferred_function_schema; break; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index a2e8d9e2a9e1d..5c6b1e23b0d16 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -21,7 +21,11 @@ namespace c10 { namespace detail { // The first argument of the schema might be of type DispatchKeySet, in which case we remove it. 
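allowlist_contains above (where the two sides differ only in substr(cur) == item versus .compare(item) == 0) scans a ';'-separated list without allocating. A standalone constexpr sketch of the same walk:

    #include <cstddef>
    #include <string_view>

    constexpr bool allowlistContains(std::string_view allowlist, std::string_view item) {
      std::size_t cur = 0;
      while (true) {
        std::size_t next = allowlist.find(';', cur);
        if (next == std::string_view::npos) {
          return allowlist.substr(cur) == item;            // last (or only) token
        }
        if (allowlist.substr(cur, next - cur) == item) {
          return true;
        }
        cur = next + 1;                                    // skip past the ';'
      }
    }
    static_assert(allowlistContains("aten::add;aten::mul", "aten::mul"));
    static_assert(!allowlistContains("aten::add;aten::mul", "aten::sub"));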
+<<<<<<< HEAD // We do this because every argument in a function schema is expected to be convertible +======= +// We do this because every argument in a function schema is expected to be convertable +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of. // See Note [Plumbing Keys Through The Dispatcher] template @@ -411,6 +415,10 @@ class TORCH_API RegisterOperators final { Options() : schemaOrName_(std::nullopt) +<<<<<<< HEAD +======= + , kernels() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) , aliasAnalysisKind_(std::nullopt) {} @@ -419,6 +427,10 @@ class TORCH_API RegisterOperators final { struct KernelRegistrationConfig final { KernelRegistrationConfig() : dispatch_key(std::nullopt) +<<<<<<< HEAD +======= + , func() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) , cpp_signature(std::nullopt) , inferred_function_schema(nullptr) {} diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index e3f68128a9e14..dd854589a854e 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -251,7 +251,11 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox callOpUnboxedWithPrecomputedDispatchKeySet(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA)); EXPECT_TRUE(called_kernel_cpu); +<<<<<<< HEAD // Ensure that dispatch key from tensor is not used here. +======= + // Ensure that disptach key from tensor is not used here. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) called_kernel_cpu = false; expectThrows([&] { callOpUnboxedWithPrecomputedDispatchKeySet(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU)); diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 4c138ee504564..fd81a994b00ea 100644 --- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -83,7 +83,11 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) { } TORCH_API std::string toString(const OperatorName& opName); +<<<<<<< HEAD TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/); +======= +TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace c10 diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index 9d8080cb8f317..c5d534a8e0973 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -172,7 +172,11 @@ VaryingShape TensorType::computeStrideProps( // The logic below follows what TensorIterator uses in its logic: // 1. Fast_set_up is the short-cut to identify a. channels_last and // b. contiguous format, which is what we have in the below logic. +<<<<<<< HEAD // 2. In more general cases, it does best effort to preserve permutatoin. +======= + // 2. In more generla cases, it does best effort to preserve permutatoin. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) { // case 1.a. short cut channels last std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2); diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index abba4e14583a3..8d9e301cd60ec 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -8,7 +8,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -827,7 +830,13 @@ TupleType::TupleType( : NamedType(TypeKind::TupleType, std::move(name)), elements_(std::move(elements)), has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) { +<<<<<<< HEAD TORCH_CHECK(v, "Can not create tuple with None type"); +======= + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return v->hasFreeVariables(); })), schema_(std::move(schema)) { diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index 011a1750ecaa0..64f883ddb8bf9 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -16,7 +16,11 @@ class SingletonTypePtr { /* implicit */ SingletonTypePtr(T* p) : repr_(p) {} // We need this to satisfy Pybind11, but it shouldn't be hit. +<<<<<<< HEAD explicit SingletonTypePtr(std::shared_ptr /*unused*/) { TORCH_CHECK(false); } +======= + explicit SingletonTypePtr(std::shared_ptr) { TORCH_CHECK(false); } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using element_type = typename std::shared_ptr::element_type; diff --git a/aten/src/ATen/cpu/vec/intrinsics.h b/aten/src/ATen/cpu/vec/intrinsics.h index 70223700f6364..65752fe8628cf 100644 --- a/aten/src/ATen/cpu/vec/intrinsics.h +++ b/aten/src/ATen/cpu/vec/intrinsics.h @@ -1 +1,59 @@ +<<<<<<< HEAD #include +======= +#pragma once +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC or clang-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__)) +/* Clang-compatible compiler, targeting arm neon */ +#include +#if defined(__ARM_FEATURE_SVE) +/* CLANG-compatible compiler, targeting ARM with SVE */ +#include +#endif +#elif defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) \ + (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) \ + (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) \ + (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) \ + (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +#endif +#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#if defined(__ARM_FEATURE_SVE) +/* GCC-compatible compiler, targeting ARM with SVE */ +#include +#endif +#if defined(MISSING_ARM_VLD1) +#include +#elif defined(MISSING_ARM_VST1) +#include +#endif +#elif 
defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif defined(__s390x__) +// targets Z/architecture +// we will include vecintrin later +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +/* We need to undef those tokens defined by to avoid conflicts + with the C++ types. => Can still use __bool/__vector */ +#undef bool +#undef vector +#undef pixel +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index 9e0b189bdac89..c82c34f7be0bb 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -5,7 +5,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at { namespace vec { @@ -37,7 +40,11 @@ class Vectorized { return VECTOR_WIDTH / sizeof(BFloat16); } +<<<<<<< HEAD Vectorized(); +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svbfloat16_t v) : values(v) {} Vectorized(int val); Vectorized(BFloat16 val); @@ -164,9 +171,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const; Vectorized hypot(const Vectorized& b) const; Vectorized i0() const; @@ -224,12 +234,17 @@ class Vectorized { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 // Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE __attribute__((optimize("no-tree-vectorize"))) #endif inline std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a) { +======= +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert( Vectorized::size() == 2 * Vectorized::size()); auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); @@ -307,11 +322,14 @@ Vectorized inline operator/( return binary_operator_via_float(std::divides>(), a, b); } +<<<<<<< HEAD inline Vectorized::Vectorized() { auto vals_f = svdup_n_f32(0); values = convert_float_bfloat16(vals_f, vals_f); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized::Vectorized(int val) { auto vals_f = svdup_n_f32(val); values = convert_float_bfloat16(vals_f, vals_f); diff --git a/aten/src/ATen/cpu/vec/sve/vec_double.h b/aten/src/ATen/cpu/vec/sve/vec_double.h index 474652be17a1a..6867d1687451a 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_double.h +++ b/aten/src/ATen/cpu/vec/sve/vec_double.h @@ -38,9 +38,13 @@ class Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(double); } +<<<<<<< HEAD 
Vectorized() { values = svdup_n_f64(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svfloat64_t v) : values(v) {} Vectorized(double val) { values = svdup_n_f64(val); @@ -251,9 +255,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const {USE_SLEEF( { return Vectorized(Sleef_fmoddx_sve(values, q)); }, { @@ -587,6 +594,7 @@ Vectorized inline fmadd( return svmad_f64_x(ptrue, a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmadd( const Vectorized& a, @@ -611,6 +619,8 @@ Vectorized inline fnmsub( return svnmad_f64_x(ptrue, a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // defined(CPU_CAPABILITY_SVE) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/sve/vec_float.h b/aten/src/ATen/cpu/vec/sve/vec_float.h index 7e7a8fe682ff6..24895107dd778 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_float.h +++ b/aten/src/ATen/cpu/vec/sve/vec_float.h @@ -38,9 +38,13 @@ class Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(float); } +<<<<<<< HEAD Vectorized() { values = svdup_n_f32(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svfloat32_t v) : values(v) {} Vectorized(float val) { values = svdup_n_f32(val); @@ -104,6 +108,74 @@ class Vectorized { } return b; } +<<<<<<< HEAD +======= + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L105 + inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const { + const auto c1 = + svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f + const auto c2 = + svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f + const auto c3 = + svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f + const auto c4 = + svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f + const auto c5 = + svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); // x^5: 0x1.0e4020p-7f + const auto shift = svreinterpret_f32_u32( + svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32( + svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32( + 0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32( + 0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + // Range reduction: + // e^x = 2^n * e^r + // where: + // n = floor(x / ln(2)) + // r = x - n * ln(2) + // + // By adding x / ln(2) with 2^23 + 127 (shift): + // * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 + // forces decimal part + // of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. 
+ // n) + 127 will occupy the whole fraction part of z in FP32 format. + // Subtracting 2^23 + 127 (shift) from z will result in the integer part + // of x / ln(2) (i.e. n) because the decimal part has been pushed out + // and lost. + // * The addition of 127 makes the FP32 fraction part of z ready to be + // used as the exponent + // in FP32 format. Left shifting z by 23 bits will result in 2^n. + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32( + svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + // The calculation of n * ln(2) is done using 2 steps to achieve accuracy + // beyond FP32. This outperforms longer Taylor series (3-4 tabs) both in + // term of accuracy and performance. + const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + // Compute the truncated Taylor series of e^r. + // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) + const auto r2 = svmul_f32_z(pg, r, r); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + auto poly = svmla_f32_z(pg, scale, p12345, scale); + // Handle underflow and overflow. + poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly); + return poly; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized loadu(const void* ptr, int64_t count = size()) { if (count == size()) return svld1_f32(ptrue, reinterpret_cast(ptr)); @@ -248,6 +320,7 @@ class Vectorized { return USE_SLEEF( Vectorized(Sleef_expm1fx_u10sve(values)), map(std::expm1)); } +<<<<<<< HEAD // Implementation copied from Arm Optimized Routines: // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c Vectorized exp_u20() const { @@ -283,6 +356,10 @@ class Vectorized { } Vectorized fexp_u20() const { return exp_u20(); +======= + Vectorized exp_u20() const { + return exp(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized fmod(const Vectorized& q) const {USE_SLEEF( { return Vectorized(Sleef_fmodfx_sve(values, q)); }, @@ -418,11 +495,17 @@ class Vectorized { ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH); // Step 2: Calculate exp(2 * x), where x is the clamped value. +<<<<<<< HEAD // svmul_f32_z computes 2 * x, and exp_u20() computes the exponential of // the result (via Vectorized, then auto-converts back to // svfloat32_t). svfloat32_t exp2x = Vectorized(svmul_f32_z(ptrue, CONST_2, x)).exp_u20(); +======= + // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of + // the result. + svfloat32_t exp2x = svexp_f32_z(ptrue, svmul_f32_z(ptrue, CONST_2, x)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Step 3: Calculate the numerator of the tanh function, which is exp(2x) // - 1. 
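The comment block above describes the classic range-reduction scheme: write e^x = 2^n * e^r with n = round(x / ln 2), evaluate a short polynomial for e^r, and build 2^n by sliding n + 127 into the exponent field of a float (which is what the 2^23 + 127 shift constant accomplishes). A scalar sketch of the same scheme, using illustrative Taylor coefficients rather than the tuned constants above and omitting the under/overflow clamping:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    float fast_exp(float x) {
      const float inv_ln2 = 1.4426950408889634f;
      const float neg_ln2 = -0.6931471805599453f;
      float n = std::nearbyint(x * inv_ln2);          // n = round(x / ln 2)
      float r = std::fma(n, neg_ln2, x);              // r = x - n * ln 2, |r| <= ln(2)/2
      // Degree-4 Taylor polynomial for e^r on the reduced range.
      float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6.0f + r * (1.0f / 24.0f))));
      // 2^n: place (n + 127) into the 8-bit exponent field (bits 23..30).
      uint32_t bits = static_cast<uint32_t>(static_cast<int32_t>(n) + 127) << 23;
      float scale;
      std::memcpy(&scale, &bits, sizeof(scale));
      return scale * p;                               // valid only while n + 127 stays in [1, 254]
    }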
@@ -725,6 +808,7 @@ Vectorized inline fmadd( return svmad_f32_x(ptrue, a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmadd( const Vectorized& a, @@ -749,6 +833,8 @@ Vectorized inline fnmsub( return svnmad_f32_x(ptrue, a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // defined(CPU_CAPABILITY_SVE) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/sve/vec_int.h b/aten/src/ATen/cpu/vec/sve/vec_int.h index f0bc42caa9502..2a5f8a2468851 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_int.h +++ b/aten/src/ATen/cpu/vec/sve/vec_int.h @@ -32,9 +32,13 @@ inline namespace CPU_CAPABILITY { static constexpr size_type size() { \ return vl; \ } \ +<<<<<<< HEAD Vectorized() { \ values = svdup_n_s##bit(0); \ } \ +======= + Vectorized() {} \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(svint##bit##_t v) : values(v) {} \ Vectorized(int##bit##_t val) { \ values = svdup_n_s##bit(val); \ diff --git a/aten/src/ATen/cpu/vec/vec128/vec128.h b/aten/src/ATen/cpu/vec/vec128/vec128.h index 9f9079d475a8f..d91194c0e19ac 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128.h @@ -6,11 +6,16 @@ #ifdef __aarch64__ #if !defined(CPU_CAPABILITY_SVE) #include +<<<<<<< HEAD #include #include #include #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #include diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h index aae7f2a79c2ea..84580cbb86ebb 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h @@ -354,6 +354,7 @@ class Vectorized : public Vectorized16< DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) Vectorized frac() const; +<<<<<<< HEAD DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) @@ -395,6 +396,11 @@ class Vectorized : public Vectorized16< } #else DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) +======= + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc) + DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=) @@ -402,7 +408,10 @@ class Vectorized : public Vectorized16< DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>) DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=) +<<<<<<< HEAD #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD #undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD @@ -451,6 +460,7 @@ template <> Vectorized inline operator+( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -458,12 +468,16 @@ Vectorized inline operator+( #else return 
binary_operator_via_float(std::plus>(), a, b); #endif +======= + return binary_operator_via_float(std::plus>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator-( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -471,12 +485,16 @@ Vectorized inline operator-( #else return binary_operator_via_float(std::minus>(), a, b); #endif +======= + return binary_operator_via_float(std::minus>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator*( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -484,12 +502,16 @@ Vectorized inline operator*( #else return binary_operator_via_float(std::multiplies>(), a, b); #endif +======= + return binary_operator_via_float(std::multiplies>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator/( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -497,6 +519,9 @@ Vectorized inline operator/( #else return binary_operator_via_float(std::divides>(), a, b); #endif +======= + return binary_operator_via_float(std::divides>(), a, b); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // frac. Implement this here so we can use subtraction @@ -607,18 +632,22 @@ Vectorized inline fmadd( const Vectorized& a, const Vectorized& b, const Vectorized& c) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; bfloat16x8_t z = c; return x * y + z; #else +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also, // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered // elements, not the bottom and top half, so they don't seem // particularly useful here. Ideally we would include dot product in // the Vectorized interface... return a * b + c; +<<<<<<< HEAD #endif } @@ -636,6 +665,8 @@ Vectorized inline fnmadd( // See NOTE [BF16 FMA] above. return -a * b + c; #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -643,6 +674,7 @@ Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, const Vectorized& c) { +<<<<<<< HEAD #ifdef __ARM_FEATURE_BF16 bfloat16x8_t x = a; bfloat16x8_t y = b; @@ -668,6 +700,10 @@ Vectorized inline fnmsub( // See NOTE [BF16 FMA] above. return -a * b - c; #endif +======= + // See NOTE [BF16 FMA] above. 
+ return a * b - c; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif // !defined(C10_MOBILE) && defined(__aarch64__) diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index b2e6016bcc12e..a95484b0596b2 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -5,6 +5,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +<<<<<<< HEAD // Enable auto-vectorization for GCC-13+ and clang-17+ // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 @@ -249,6 +250,8 @@ inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) { #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct VecConvert< float, diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index 67760ec967aa1..1198feadb52ee 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -83,9 +83,13 @@ class Vectorized { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = vmovq_n_f32(0); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(float32x4_t v) : values(v) {} Vectorized(float val) : values{vdupq_n_f32(val)} {} Vectorized(float val0, float val1, float val2, float val3) @@ -204,6 +208,7 @@ class Vectorized { store(tmp); return tmp[idx]; } +<<<<<<< HEAD int zero_mask() const { uint32x4_t is_zero_vec = vceqzq_f32(values); const int32x4_t shift = vcombine_s32( @@ -212,6 +217,20 @@ class Vectorized { uint32x4_t bits_vec = vshlq_u32(vandq_u32(is_zero_vec, vdupq_n_u32(1)), shift); return vaddvq_u32(bits_vec); +======= + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. 
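Both sides of the zero_mask() conflict below compute the same contract: bit i of the result is set when lane i of the vector is zero. A minimal scalar model of that contract follows, with a comment on how the NEON branch-free variant reaches the same mask; zero_mask4 is a hypothetical name used only for illustration.

#include <cstdio>

// Scalar model of zero_mask(): bit i is set iff lane i equals 0.
// The NEON version is branch-free: vceqzq yields all-ones per zero lane,
// AND with 1 keeps a single bit, a per-lane shift by the lane index
// positions that bit, and a horizontal add (vaddvq) collects the mask.
static int zero_mask4(const float (&lanes)[4]) {
  int mask = 0;
  for (int i = 0; i < 4; ++i) {
    if (lanes[i] == 0.f) {
      mask |= (1 << i);
    }
  }
  return mask;
}

int main() {
  float v[4] = {0.f, 1.f, 0.f, 3.f};
  std::printf("mask = 0x%x\n", zero_mask4(v));  // 0x5: lanes 0 and 2 are zero
}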
+ int zero_mask() const { + __at_align__ float tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0.f) { + mask |= (1 << i); + } + } + return mask; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized isnan() const { return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, values))); @@ -307,6 +326,7 @@ class Vectorized { DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1) +<<<<<<< HEAD // Implementation copied from Arm Optimized Routine // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c Vectorized exp_u20() const { @@ -350,6 +370,10 @@ class Vectorized { } Vectorized fexp_u20() const { return exp_u20(); +======= + Vectorized exp_u20() const { + return exp(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( fmod, @@ -579,6 +603,45 @@ inline Vectorized Vectorized::le( } template <> +<<<<<<< HEAD +======= +inline void convert(const float* src, int32_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmadd( const Vectorized& a, const Vectorized& b, @@ -587,6 +650,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -595,6 +659,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -602,6 +668,7 @@ Vectorized inline fmsub( return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -610,6 +677,8 @@ Vectorized inline fnmsub( return Vectorized(vnegq_f32(vfmaq_f32(c, a, b))); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized Vectorized::erf() const { // constants const Vectorized neg_zero_vec(-0.f); @@ -634,7 +703,12 @@ inline Vectorized Vectorized::erf() const { // - exp(- x * x) auto pow_2 = (*this) * (*this); auto neg_pow_2 = pow_2 ^ neg_zero_vec; +<<<<<<< HEAD auto tmp4 = neg_pow_2.exp(); +======= + auto tmp4 = neg_pow_2.map( + std::exp); // This can be swapped for a faster implementation of exp. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto tmp5 = tmp4 ^ neg_zero_vec; // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) auto tmp6 = t * tmp5; diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h index c40480ec73ac2..e9925a3def4de 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -220,6 +220,7 @@ class Vectorized : public Vectorized16< std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); } } +<<<<<<< HEAD int zero_mask() const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC uint16x8_t is_zero_vec = vceqzq_f16(values); @@ -246,6 +247,10 @@ class Vectorized : public Vectorized16< return mask; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } +======= + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized isnan() const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC return vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, values))); @@ -569,6 +574,49 @@ inline Vectorized Vectorized::le( return (*this <= other) & Vectorized(1); } +<<<<<<< HEAD +======= +// These are global functions, so the defaults in vec_base.h should +// work fine if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is not available. +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void convert(const float16_t* src, int16_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int16_t* src, float16_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized inline fmadd( const Vectorized& a, @@ -582,6 +630,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -594,6 +643,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -604,6 +655,7 @@ Vectorized inline fmsub( return a * b - c; #endif } +<<<<<<< HEAD template <> Vectorized inline fnmsub( @@ -616,6 +668,8 @@ Vectorized inline fnmsub( return -a * b - c; #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) && defined(__aarch64__) } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h index 
5fb3679f37239..c7855f0a4fb4d 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h @@ -206,10 +206,13 @@ struct Vectorized16 { return static_cast(this)->map_with_vec_float_method( &Vectorized::exp_u20); } +<<<<<<< HEAD Derived fexp_u20() const { return static_cast(this)->map_with_vec_float_method( &Vectorized::exp_u20); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Derived fmod(const Derived& q) const { // This function is questionable with a conversion, so we use map2 return map2(q, std::fmod); diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h index aa40000b6ccdb..21ae6d5aef1a8 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h @@ -1 +1,400 @@ +<<<<<<< HEAD #include +======= +/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */ + +__extension__ extern __inline uint8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u8_x2(const uint8_t* __a) { + uint8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s8_x2(const int8_t* __a) { + int8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u16_x2(const uint16_t* __a) { + uint16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s16_x2(const int16_t* __a) { + int16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u32_x2(const uint32_t* __a) { + uint32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s32_x2(const int32_t* __a) { + int32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u64_x2(const uint64_t* __a) { + uint64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s64_x2(const int64_t* __a) { + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f16_x2(const float16_t* __a) { + float16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float32x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + 
vld1_f32_x2(const float32_t* __a) { + float32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f64_x2(const float64_t* __a) { + float64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly8x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p8_x2(const poly8_t* __a) { + poly8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly16x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p16_x2(const poly16_t* __a) { + poly16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly64x1x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p64_x2(const poly64_t* __a) { + poly64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u8_x2(const uint8_t* __a) { + uint8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s8_x2(const int8_t* __a) { + int8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u16_x2(const uint16_t* __a) { + uint16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s16_x2(const int16_t* __a) { + int16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint32x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u32_x2(const uint32_t* __a) { + uint32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int32x4x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s32_x2(const int32_t* __a) { + int32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline uint64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u64_x2(const uint64_t* __a) { + uint64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline int64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s64_x2(const int64_t* __a) { + int64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f16_x2(const float16_t* __a) { + float16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float32x4x2_t + 
__attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f32_x2(const float32_t* __a) { + float32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline float64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f64_x2(const float64_t* __a) { + float64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly8x16x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p8_x2(const poly8_t* __a) { + poly8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly16x8x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p16_x2(const poly16_t* __a) { + poly16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +__extension__ extern __inline poly64x2x2_t + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p64_x2(const poly64_t* __a) { + poly64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); + return ret; +} + +/* vst1x2 */ + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s64_x2(int64_t* __a, int64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f64_x2(float64_t* __a, float64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s8_x2(int8_t* __a, int8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s16_x2(int16_t* __a, int16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s32_x2(int32_t* __a, int32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ 
extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f16_x2(float16_t* __a, float16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f32_x2(float32_t* __a, float32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s8_x2(int8_t* __a, int8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s16_x2(int16_t* __a, int16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s32_x2(int32_t* __a, int32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s64_x2(int64_t* __a, int64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f16_x2(float16_t* __a, float16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, 
float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f64_x2(float64_t* __a, float64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h index b3d721531d246..c2c0c0d91e29c 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h @@ -1 +1,11 @@ +<<<<<<< HEAD #include +======= +/* Workaround for missing vst1q_f32_x2 in gcc-8. */ + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h index 425fb6aa79e13..19eed91ff9199 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h @@ -488,9 +488,12 @@ class Vectorized16 { Vectorized expm1() const { return map(Sleef_expm1f8_u10); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { return exp(); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 735315bee7686..00ce79a218825 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -34,9 +34,13 @@ class Vectorized> { static constexpr size_type size() { return 2; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256d v) : values(v) {} Vectorized(c10::complex val) { double real_value = val.real(); @@ -342,6 +346,7 @@ class Vectorized> { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } Vectorized> operator<( +<<<<<<< HEAD const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } @@ -355,6 +360,21 @@ class Vectorized> { } Vectorized> operator>=( const Vectorized>& /*unused*/) const { +======= + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>&) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) TORCH_CHECK(false, "not supported for complex numbers"); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index 5d8c69a34b9d2..4b6f518e96c7c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -33,9 +33,13 @@ class Vectorized> { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256 v) : values(v) {} Vectorized(c10::complex val) { float real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index d5abafedec2e6..75df7b555381a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -31,9 +31,13 @@ class Vectorized { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256d v) : values(v) {} Vectorized(double val) { values = _mm256_set1_pd(val); @@ -200,9 +204,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd4(values, q)); } @@ -496,6 +503,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -504,12 +512,15 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, const Vectorized& c) { return _mm256_fmsub_pd(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( @@ -518,6 +529,8 @@ Vectorized inline fnmsub( const Vectorized& c) { return _mm256_fnmsub_pd(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #endif diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index a42a51e567a63..c8584f3ad84b8 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -1,4 +1,8 @@ #pragma once +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] @@ -30,9 +34,13 @@ class Vectorized { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m256 v) : values(v) {} Vectorized(float val) { values = _mm256_set1_ps(val); @@ -257,6 +265,7 @@ class Vectorized { Vectorized expm1() const { return Vectorized(Sleef_expm1f8_u10(values)); } +<<<<<<< HEAD Vectorized fexp_u20() const { const __m256 vec_c0 = _mm256_set1_ps(0.00010703434948458272f); const __m256 vec_c1 = _mm256_set1_ps(0.30354260500649682f); @@ -314,6 +323,8 @@ class Vectorized { return result; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { // A faster version of exp with ULP=20 const __m256 vec_factorial_1 = @@ -697,6 +708,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -705,6 +717,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -712,6 +726,7 @@ Vectorized inline fmsub( return _mm256_fmsub_ps(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -720,6 +735,8 @@ Vectorized inline fnmsub( return _mm256_fnmsub_ps(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm inline void transpose_block(at::vec::VectorizedN& input) { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 998177758be8d..ce695e89a1b7e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -23,9 +23,13 @@ struct Vectorizedi { } public: +<<<<<<< HEAD Vectorizedi() { values = _mm256_setzero_si256(); } +======= + Vectorizedi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi(__m256i v) : values(v) {} operator __m256i() const { return values; @@ -55,9 +59,13 @@ class Vectorized : public Vectorizedi { return 4; } using Vectorizedi::Vectorizedi; +<<<<<<< HEAD Vectorized() { values = _mm256_setzero_si256(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); } @@ -905,7 +913,11 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 // bits of the result are undefined. +<<<<<<< HEAD // TODO We can use _mm256_zextsi128_si256 in the future, +======= + // TODO We can use _mm256_zextsi128_si256 in the furture, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // since gcc 9.3 doesn't support it now. 
__m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); return _mm256_castsi128_si256(input_128); @@ -1740,7 +1752,11 @@ Vectorized inline shift_256_16( // Control masks for shuffle operation, treating 256 bits as an // array of 16-bit elements, and considering pairs of neighboring +<<<<<<< HEAD // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and +======= + // elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. @@ -1844,7 +1860,11 @@ Vectorized inline shift_256_16( c0 = _mm256_srav_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_1_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%2==1. __m256i a1 = _mm256_and_si256(a, keep_1); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -1875,7 +1895,11 @@ Vectorized inline shift_256_8( // Control masks for shuffle operation, treating 256 bits as an // array of 8-bit elements, and considering quadruples of +<<<<<<< HEAD // neighboring elements. Specifically, a mask named "ctl_M_N" (M,N +======= + // neighboring elements. Specifially, a mask named "ctl_M_N" (M,N +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // in [0,1,2,3], and M!=N) is set so that shuffle will move element // with index M from input quadruple into element with index N in // output quadruple, and other elements in output quadruple will be @@ -2180,7 +2204,11 @@ Vectorized inline shift_256_8( c0 = _mm256_srlv_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_3_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==1. __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2193,7 +2221,11 @@ Vectorized inline shift_256_8( c1 = _mm256_srlv_epi32(a1, b1); c1 = _mm256_shuffle_epi8(c1, ctl_3_1); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==2. __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); @@ -2206,7 +2238,11 @@ Vectorized inline shift_256_8( c2 = _mm256_srlv_epi32(a2, b2); c2 = _mm256_shuffle_epi8(c2, ctl_3_2); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%4==3. 
__m256i a3 = _mm256_and_si256(a, keep_3); __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 2b70564b9ca81..84b7eff128732 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -54,9 +54,13 @@ struct Vectorizedqi { #endif public: +<<<<<<< HEAD Vectorizedqi() { vals = _mm256_setzero_si256(); } +======= + Vectorizedqi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedqi(__m256i v) : vals(v) {} operator __m256i() const { return vals; @@ -123,29 +127,46 @@ typename std::enable_if_t< } template +<<<<<<< HEAD at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src); template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); +<<<<<<< HEAD constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to int8 using unsigned saturation __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); +======= + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to uint8/int8 using unsigned saturation + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); } +<<<<<<< HEAD template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { @@ -169,6 +190,8 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm256_castsi128_si256(result); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __FORCE_INLINE void QuantizeAvx2( const float* src, @@ -1377,7 +1400,11 @@ Vectorized inline maximum( #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto s8x8 = vget_low_s8(src); +======= + auto s8x8 = vld1_s8(src.operator const int8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto s16x8 = vmovl_s8(s8x8); auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); @@ -1390,7 +1417,11 @@ std::pair, Vectorized> inline convert_int8_to_float( std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto u8x8 = vget_low_u8(src); +======= + auto u8x8 = vld1_u8(src.operator const uint8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) auto u16x8 = vmovl_u8(u8x8); auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); @@ -1402,7 +1433,11 @@ std::pair, Vectorized> inline convert_int8_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto s8x8 = vget_low_s8(src); +======= + auto s8x8 = vld1_s8(src.operator const int8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto s16x8 = vmovl_s8(s8x8); auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); @@ -1412,7 +1447,11 @@ Vectorized inline convert_int8_half_register_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { +<<<<<<< HEAD auto u8x8 = vget_low_u8(src); +======= + auto u8x8 = vld1_u8(src.operator const uint8_t*()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto u16x8 = vmovl_u8(u8x8); auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index db574702f3ee1..7e19ccc8a9352 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -143,7 +143,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -273,9 +277,12 @@ class Vectorized { Vectorized C10_ALWAYS_INLINE exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized lgamma() const __ubsan_ignore_undefined__ { return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index 535d3a23173d5..ad7492a56f917 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -142,7 +142,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -352,9 +356,12 @@ class Vectorized { Vectorized C10_ALWAYS_INLINE exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE log() const { return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)}; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h index 
7176dd15d75ed..f860f2dbb6588 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h @@ -202,7 +202,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { @@ -349,6 +353,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -362,8 +389,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD DEFINE_SHIFT_FUNCS(int16_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h index 75d3ba381ad41..42a4a996af490 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h @@ -155,7 +155,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // assuming this we can use the same mask directly with vec_sel // warning intel style mask will not work properly return { @@ -279,6 +283,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -292,8 +319,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD 
DEFINE_SHIFT_FUNCS(int32_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h index 653c277b7d033..ff1df97190265 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h @@ -119,7 +119,11 @@ class Vectorized { const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // the mask used here returned by comparison of vec256 +======= + // the mask used here returned by comparision of vec256 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask._vecb0), @@ -232,6 +236,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { @@ -245,8 +272,11 @@ Vectorized inline minimum( return a.minimum(b); } +<<<<<<< HEAD DEFINE_SHIFT_FUNCS(int64_t) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index 7ca603c0b91df..95b7905203127 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -1,6 +1,9 @@ #pragma once #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -40,6 +43,7 @@ using vfloat32 = __attribute__((altivec(vector__))) float; using vfloat64 = __attribute__((altivec(vector__))) double; #endif +<<<<<<< HEAD inline auto make_vuint(vint8 v) { return reinterpret_cast(v); } @@ -53,6 +57,8 @@ inline auto make_vuint(vint64 v) { return reinterpret_cast(v); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(vec_float) C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) { vfloat32 vec_out; @@ -535,6 +541,7 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; +<<<<<<< HEAD template Vectorized VsxShiftRightArith( const Vectorized& a, @@ -571,6 +578,8 @@ Vectorized VsxShiftLeftArith( return VsxShiftLeftArith(a, b); \ } +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index efb97b3c614db..cd430913e695c 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -1023,9 +1023,12 @@ struct Vectorized()>> { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized log() const { return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 975b71ce9a867..2d796a84e28a0 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -397,7 +397,11 @@ inline Vectorized operator&&( const __m512i* other_ = reinterpret_cast(other.as_bytes()); __m512i out = _mm512_and_si512(*self_, *other_); Vectorized ret; +<<<<<<< HEAD // We do not have a constructor that takes __m512i, so we need to memcpy +======= + // We do not have a constructer that takes __m512i, so we need to memcpy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::memcpy(ret, &out, ret.size() * sizeof(bool)); return ret; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index 844b3b1fcc1e8..cd1b61eeb2e85 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -192,9 +192,13 @@ class Vectorized16 { static constexpr size_type size() { return 32; } +<<<<<<< HEAD Vectorized16() { values = _mm512_setzero_si512(); } +======= + Vectorized16() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized16(__m512i v) : values(v) {} Vectorized16(T val) { value_type uw = val.x; @@ -537,9 +541,12 @@ class Vectorized16 { Vectorized expm1() const { return map(Sleef_expm1f16_u10); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { return exp(); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 3776001fc8720..3d11a98ee0815 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -34,9 +34,13 @@ class Vectorized> { static constexpr size_type size() { return 4; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512d v) : values(v) {} Vectorized(c10::complex val) { double real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index d434b2a1e2070..bb91ac64c4549 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -34,9 +34,13 @@ class 
Vectorized> { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512 v) : values(v) {} Vectorized(c10::complex val) { float real_value = val.real(); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 438fd31e91618..4fcab45731748 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -34,9 +34,13 @@ class Vectorized { static constexpr size_type size() { return 8; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_pd(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512d v) : values(v) {} Vectorized(double val) { values = _mm512_set1_pd(val); @@ -223,9 +227,12 @@ class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fexp_u20() const { return exp(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd8(values, q)); } @@ -537,6 +544,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -545,6 +553,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -552,6 +562,7 @@ Vectorized inline fmsub( return _mm512_fmsub_pd(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -560,6 +571,8 @@ Vectorized inline fnmsub( return _mm512_fnmsub_pd(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 7a9e69b76c851..de0c9ea3fca26 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -32,9 +32,13 @@ class Vectorized { static constexpr size_type size() { return 16; } +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_ps(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(__m512 v) : values(v) {} Vectorized(float val) { values = _mm512_set1_ps(val); @@ -312,6 +316,7 @@ class Vectorized { Vectorized expm1() const { return Vectorized(Sleef_expm1f16_u10(values)); } +<<<<<<< HEAD Vectorized fexp_u20() const { const __m512 vec_c0 = _mm512_set1_ps(0.00010703434948458272f); const __m512 vec_c1 = _mm512_set1_ps(0.30354260500649682f); @@ -366,6 +371,8 @@ class Vectorized { // final interpretation to float return _mm512_castsi512_ps(casted_integer); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized exp_u20() const { // A faster version of exp with ULP=20 const __m512 vec_factorial_1 = @@ -750,6 +757,7 @@ Vectorized inline fmadd( } template <> +<<<<<<< HEAD 
Vectorized inline fnmadd( const Vectorized& a, const Vectorized& b, @@ -758,6 +766,8 @@ Vectorized inline fnmadd( } template <> +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, @@ -765,6 +775,7 @@ Vectorized inline fmsub( return _mm512_fmsub_ps(a, b, c); } +<<<<<<< HEAD template <> Vectorized inline fnmsub( const Vectorized& a, @@ -773,6 +784,8 @@ Vectorized inline fnmsub( return _mm512_fnmsub_ps(a, b, c); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm // Code referred to FBGEMM: diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 0a2f2c5f94823..2e29187a66165 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -53,9 +53,13 @@ class Vectorized : public Vectorizedi { return 8; } using Vectorizedi::Vectorizedi; +<<<<<<< HEAD Vectorized() { values = _mm512_setzero_si512(); } +======= + Vectorized() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t v) { values = _mm512_set1_epi64(v); } @@ -1088,7 +1092,11 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 // bits of the result are undefined. +<<<<<<< HEAD // TODO We can use _mm512_zextsi128_si512 in the future, +======= + // TODO We can use _mm512_zextsi128_si512 in the furture, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); @@ -1852,7 +1860,11 @@ Vectorized inline shift_512_8( // Control masks for shuffle operation, treating 512 bits as an // array of 8-bit elements, and considering pairs of neighboring +<<<<<<< HEAD // elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and +======= + // elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. @@ -2022,7 +2034,11 @@ Vectorized inline shift_512_8( c0 = _mm512_srlv_epi16(a0, b0); c0 = _mm512_shuffle_epi8(c0, ctl_1_0); +<<<<<<< HEAD // Perform shifting the same way for input array elements with +======= + // Peform shifting the same way for input array elements with +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // idx%2==1. 
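A note for readers of this hunk, since the comments above compress the trick quite a bit: AVX-512 has no variable per-byte shift instruction, so shift_512_8 emulates one with the 16-bit shifts, handling even-indexed (idx%2==0) and odd-indexed (idx%2==1) bytes in two passes and recombining the halves with the ctl_*_* shuffle masks built earlier in the function. A rough scalar model of the right-shift path is sketched below; the helper name and exact masking are illustrative, not taken from the file.

    #include <cstdint>

    // Scalar sketch: shift each byte of a 16-bit word independently while only
    // ever using 16-bit shifts, mirroring the even/odd two-pass structure above.
    static inline uint16_t shift_byte_pair_right(uint16_t word,
                                                 unsigned shift_even,
                                                 unsigned shift_odd) {
      // even byte (idx%2==0): clear the odd byte first so its bits cannot leak in,
      // then shift the whole 16-bit lane
      uint16_t even = static_cast<uint16_t>((word & 0x00FFu) >> shift_even);
      // odd byte (idx%2==1): move it down into the low byte (the shuffle step),
      // then shift the 16-bit lane the same way
      uint16_t odd = static_cast<uint16_t>(((word >> 8) & 0x00FFu) >> shift_odd);
      // recombine: the odd result is shuffled back into the high byte
      return static_cast<uint16_t>((odd << 8) | even);
    }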
__m512i a1 = _mm512_and_si512(a, keep_1); __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index 64ba47e0f0646..ba4fa82bbeed5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -55,9 +55,13 @@ struct Vectorizedqi { #endif public: +<<<<<<< HEAD Vectorizedqi() { vals = _mm512_setzero_si512(); } +======= + Vectorizedqi() {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedqi(__m512i v) : vals(v) {} operator __m512i() const { return vals; @@ -125,24 +129,40 @@ typename std::enable_if_t< } template +<<<<<<< HEAD at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src); template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); +<<<<<<< HEAD constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to int8 using unsigned saturation __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); +======= + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + // Convert from int16 to uint8/int8 using unsigned saturation + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i permute_mask_v = _mm512_set_epi32( 0x0f, 0x0b, @@ -163,6 +183,7 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); } +<<<<<<< HEAD template <> at::vec::Vectorized inline convert_float_to_int8( at::vec::Vectorized src) { @@ -178,6 +199,8 @@ at::vec::Vectorized inline convert_float_to_int8( return _mm512_castsi128_si512(int8_src); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __FORCE_INLINE void QuantizeAvx512( const float* src, diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index b4441981b3d87..1b86794d8a336 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -238,6 +238,12 @@ struct Vectorized { Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); +<<<<<<< HEAD +======= +#if defined(__clang__) && __ARM_FEATURE_SVE +#pragma clang loop vectorize(disable) +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { vector[i] = b[i]; @@ -544,9 +550,12 @@ struct Vectorized { Vectorized exp_u20() const { return map(std::exp); } +<<<<<<< HEAD Vectorized fexp_u20() const { return map(std::exp); } +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized frac() const { return *this - this->trunc(); } @@ -634,7 +643,11 @@ struct Vectorized { } Vectorized neg() const { // NB: the trailing return type is needed because we need to coerce the +<<<<<<< HEAD // return value back to T in the case of unary operator- incurring a +======= + // return value back to T in the case of unary operator- incuring a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // promotion return map([](T x) -> T { return -x; }); } @@ -1248,6 +1261,7 @@ inline Vectorized fmadd( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd) template +<<<<<<< HEAD inline Vectorized fnmadd( const Vectorized& a, const Vectorized& b, @@ -1258,6 +1272,8 @@ inline Vectorized fnmadd( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmadd) template +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized fmsub( const Vectorized& a, const Vectorized& b, @@ -1268,6 +1284,7 @@ inline Vectorized fmsub( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub) template +<<<<<<< HEAD inline Vectorized fnmsub( const Vectorized& a, const Vectorized& b, @@ -1278,6 +1295,8 @@ inline Vectorized fnmsub( VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmsub) template +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline operator&&( const Vectorized& a, const Vectorized& b) { diff --git a/aten/src/ATen/cpu/vec/vec_half.h b/aten/src/ATen/cpu/vec/vec_half.h index dc1c23c74ae52..2bf13659596c5 100644 --- a/aten/src/ATen/cpu/vec/vec_half.h +++ b/aten/src/ATen/cpu/vec/vec_half.h @@ -3,12 +3,58 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) +static inline uint16_t float2half_scalar(float val) { +#if defined(CPU_CAPABILITY_AVX2) +#if defined(_MSC_VER) + __m256 v = _mm256_set1_ps(val); + __m128i o = + _mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return static_cast(_mm_cvtsi128_si32(o)); +#else + return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#endif +#elif defined(CPU_CAPABILITY_AVX512) + __m512 v = _mm512_set1_ps(val); + __m256i o = + _mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + return static_cast( + _mm_cvtsi128_si32(_mm256_castsi256_si128(o))); +#endif +} + +static inline float half2float_scalar(uint16_t val) { +#if defined(CPU_CAPABILITY_AVX2) +#if defined(_MSC_VER) + __m128i v = _mm_cvtsi32_si128(val); + __m256 o = _mm256_cvtph_ps(v); + return _mm256_cvtss_f32(o); +#else + return _cvtsh_ss(val); +#endif +#elif defined(CPU_CAPABILITY_AVX512) + __m256i v = + _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + __m512 o = _mm512_cvtph_ps(v); + return _mm512_cvtss_f32(o); +#endif +} + +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Transpose a [2, 32] matrix to [32, 2] // Note: the output leading dimension should be 2, // 
that is, the output must be contiguous diff --git a/aten/src/ATen/cpu/vec/vec_n.h b/aten/src/ATen/cpu/vec/vec_n.h index 3de55de6f1b85..93c61ad3a44b4 100644 --- a/aten/src/ATen/cpu/vec/vec_n.h +++ b/aten/src/ATen/cpu/vec/vec_n.h @@ -263,7 +263,10 @@ class VectorizedN { VECTORIZEDN_DEFINE_UNARY_OP(exp2) VECTORIZEDN_DEFINE_UNARY_OP(expm1) VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) +<<<<<<< HEAD VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VECTORIZEDN_DEFINE_UNARY_OP(frac) VECTORIZEDN_DEFINE_BINARY_OP(fmod) VECTORIZEDN_DEFINE_UNARY_OP(log) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index aaed431064611..a03f958654fb7 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -16,10 +16,14 @@ #include #include +<<<<<<< HEAD #include #ifdef USE_ROCM #include +======= +#ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // until hipblas has an API to accept flags, we must use rocblas here #include @@ -110,7 +114,11 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error) namespace { +<<<<<<< HEAD cublasOperation_t _cublasOpFromChar(char op) { +======= +static cublasOperation_t _cublasOpFromChar(char op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': @@ -130,7 +138,11 @@ cublasOperation_t _cublasOpFromChar(char op) { "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); } +<<<<<<< HEAD void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +======= +static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: leading dimensions generally are checked that they are > 0 // and at least as big the result requires (even if the value won't // be used). @@ -144,7 +156,11 @@ void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { *lda = std::max(m, 1); } +<<<<<<< HEAD void _cublasAdjustLdLevel3( +======= +static void _cublasAdjustLdLevel3( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char transa, char transb, int64_t m, @@ -191,6 +207,7 @@ uint32_t _getAlignment(uintptr_t address) { } #endif +<<<<<<< HEAD #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { // 0 is default value, meaning full CUs i.e. 
no mask @@ -249,6 +266,8 @@ static void _syncCurrentWithCarveoutStream(hipStream_t stream, bool presync) { } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CublasLtWorkspace { CublasLtWorkspace() { size = at::cuda::getCUDABlasLtWorkspaceSize(); @@ -325,7 +344,11 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-sizeof-expression) TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value))); } @@ -347,7 +370,11 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -362,7 +389,11 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template +<<<<<<< HEAD void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { +======= + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -397,7 +428,11 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { @@ -424,6 +459,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D abType = CUDA_R_16F; cType = (std::is_same_v) ? 
CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -435,12 +471,18 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -452,12 +494,21 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } +<<<<<<< HEAD +======= + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -466,7 +517,10 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -474,12 +528,15 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T); CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T); @@ -542,12 +599,16 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D &heuristicResult.algo, 
ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (cublasStatus != CUBLAS_STATUS_SUCCESS) { TORCH_WARN( @@ -591,6 +652,11 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -602,6 +668,11 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -613,6 +684,11 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -626,6 +702,11 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::co template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -639,6 +720,11 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com template inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -655,8 +741,11 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP void * beta_ptr = &fbeta; #ifdef USE_ROCM int flag = 0; +<<<<<<< 
HEAD rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; rocblas_datatype d_type = c_type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -665,8 +754,13 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP hipOperationToRocOperation(opb), (int)m, (int)n, (int)k, (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, +<<<<<<< HEAD (void*)beta_ptr, c, c_type, (int)ldc, stridec, c, d_type, (int)ldc, stridec, +======= + (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec, + c, rocblas_datatype_f16_r, (int)ldc, stridec, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, flag))); #else @@ -710,6 +804,11 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP template inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -843,7 +942,11 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -1007,6 +1110,12 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { +<<<<<<< HEAD +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1014,7 +1123,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { +<<<<<<< HEAD #ifndef USE_ROCM +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1033,6 +1148,11 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // 
See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1044,6 +1164,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1055,6 +1180,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1068,6 +1198,11 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::comp template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1081,6 +1216,11 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl template inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1097,8 +1237,11 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM int flag = 0; +<<<<<<< HEAD rocblas_datatype c_type = std::is_same::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; rocblas_datatype d_type = c_type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if USE_GEMM_FLAGS_FP16_ALT_IMPL flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; #endif @@ -1118,10 +1261,17 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( ldb, beta_ptr, c, +<<<<<<< HEAD c_type, ldc, c, d_type, +======= + rocblas_datatype_f16_r, + ldc, + c, + rocblas_datatype_f16_r, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, @@ -1138,6 +1288,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( } if (prop->major >= 5) { cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); TORCH_CHECK(fp16_reduction != at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, @@ -1147,6 +1298,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { cublas_flags = static_cast( cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Disallow fp16 reductions that could lead to unexpected overflow issues. TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); @@ -1196,6 +1351,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( template inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +<<<<<<< HEAD +======= + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1205,6 +1364,7 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT GEMM_CHECK_ARGVALUES(at::BFloat16); #ifndef USE_ROCM cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); TORCH_CHECK(bf16_reduction != at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, @@ -1214,6 +1374,10 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { cublas_flags = static_cast( cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif #if defined(USE_ROCM) @@ -1284,7 +1448,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1300,9 +1468,15 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } else{ at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); @@ -1352,7 +1526,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1368,7 +1546,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +<<<<<<< HEAD #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1524,6 +1706,12 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { +<<<<<<< HEAD +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1531,7 +1719,13 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { +<<<<<<< HEAD #ifndef USE_ROCM +======= + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1591,7 +1785,11 @@ bool gemm_and_bias( computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v) { @@ -1609,6 +1807,7 @@ bool gemm_and_bias( abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM +<<<<<<< HEAD auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); if (fp16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -1620,12 +1819,18 @@ bool gemm_and_bias( : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM +<<<<<<< HEAD auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); if (bf16_reduction != at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { @@ -1637,6 +1842,11 @@ bool gemm_and_bias( : CUBLASLT_REDUCTION_SCHEME_NONE; preference.setAttribute( CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); +======= + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } @@ -1646,7 +1856,10 @@ bool gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1654,18 +1867,27 @@ bool gemm_and_bias( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { +<<<<<<< HEAD + epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +======= +#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (bias != nullptr) { @@ -1726,12 +1948,16 @@ bool gemm_and_bias( &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (cublasStatus != CUBLAS_STATUS_SUCCESS) { TORCH_WARN( @@ -1863,6 +2089,7 @@ template bool gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD using at::blas::ScalingType; int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) { @@ -1932,6 +2159,8 @@ case ScalingType::TensorWise: } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void scaled_gemm( char transa, char transb, @@ -1943,13 +2172,19 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, +<<<<<<< HEAD ScalingType mat1_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, +<<<<<<< HEAD ScalingType mat2_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, @@ -1957,6 +2192,7 @@ void scaled_gemm( int64_t result_ld, ScalarType result_dtype, bool use_fast_accum, +<<<<<<< HEAD const std::optional& alpha) { // Note: see `cublasCommonArgs` for various non-intuitive manipulations // of input arguments to this function. 
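Before the next hunk, one orientation note: the two sides disagree on how scaling information reaches scaled_gemm. The HEAD side threads a per-operand at::blas::ScalingType (and an optional user-supplied alpha) through the call and derives the hipBLASLt row-wise path from those values, while the incoming side passes a single use_rowwise flag chosen by the caller. The sketch below shows the HEAD-style derivation in isolation; it reuses only names that appear in this diff and is illustrative rather than the exact implementation.

    // Sketch: recover a boolean row-wise flag from per-operand scaling kinds,
    // matching the check HEAD performs before selecting the legacy
    // HIPBLASLT_MATMUL_DESC_{A,B}_SCALE_POINTER_VEC_EXT attributes.
    // Assumes the at::blas::ScalingType enum (TensorWise, RowWise, ...) is visible.
    using at::blas::ScalingType;

    inline bool both_operands_rowwise(ScalingType mat1_scaling_type,
                                      ScalingType mat2_scaling_type) {
      return mat1_scaling_type == ScalingType::RowWise &&
             mat2_scaling_type == ScalingType::RowWise;
    }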
@@ -1965,20 +2201,38 @@ void scaled_gemm( // Note: alpha_val may change later depending on user-passed argument float alpha_val = 1.0; float beta_val = 0.0; +======= + bool use_rowwise) { + // Note: see `cublasCommonArgs` for various non-intuitive manupulations + // of input arguments to this function. +#if CUDA_VERSION >= 11080 || defined(USE_ROCM) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const float alpha_val = 1.0; + const float beta_val = 0.0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +<<<<<<< HEAD #if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); +======= +#if defined(USE_ROCM) +#if defined(HIPBLASLT_OUTER_VEC) + // this case is handled later as hipified CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F +#elif defined(HIPBLASLT_VEC_EXT) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } +<<<<<<< HEAD else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { @@ -1997,12 +2251,32 @@ void scaled_gemm( // rowwise isn't supported using older cublaslt or older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); #endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) +======= + else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) { + // Validate matrix dimensions for MX format + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. 
", + "Got m=", m, ", n=", n, ", k=", k); + } +#endif + } +#else + // rowwise isn't supported using older hipblaslt + TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt"); +#endif +#endif // defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); computeDesc.setAttribute(matmulDescB, mat2_scale_ptr); if (result_scale_ptr != nullptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); } +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -2010,12 +2284,15 @@ void scaled_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // ifndef USE_ROCM #ifndef USE_ROCM const int8_t fastAccuMode = use_fast_accum ? 1 : 0; @@ -2036,6 +2313,7 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } +<<<<<<< HEAD // Handle user-passed alpha float *alpha_ptr = &alpha_val; float *beta_ptr = &beta_val; @@ -2073,13 +2351,44 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); #endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) +======= + if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { +#if CUDA_VERSION >= 12080 + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above"); +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) { +#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT) + // no-op here for older hipblaslt 
ext enums, to avoid TORCH_CHECK below +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float` outer vector scaling is only supported for CUDA 12.9 and above"); +#endif // if CUDA_VERSION >= 12090 + } + + auto stream = c10::cuda::getCurrentCUDAStream(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( ltHandle, computeDesc.descriptor(), @@ -2120,10 +2429,17 @@ void scaled_gemm( auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( ltHandle, computeDesc.descriptor(), +<<<<<<< HEAD alpha_ptr, Adesc.descriptor(), Bdesc.descriptor(), beta_ptr, +======= + &alpha_val, + Adesc.descriptor(), + Bdesc.descriptor(), + &beta_val, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), Ddesc.descriptor(), all_algos[i].algo, @@ -2142,14 +2458,27 @@ void scaled_gemm( cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), +<<<<<<< HEAD alpha_ptr, +======= + &alpha_val, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), +<<<<<<< HEAD beta_ptr, // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +======= + &beta_val, +#ifdef USE_ROCM + result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +#else + nullptr, +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -2157,11 +2486,14 @@ void scaled_gemm( ltworkspace.ptr, ltworkspace.size, stream); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -2187,6 +2519,11 @@ void scaled_gemm( " scaleType ", scaleType); return; +<<<<<<< HEAD +======= +#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) + TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void int8_gemm( @@ -2213,7 +2550,10 @@ void int8_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -2221,12 +2561,15 @@ void int8_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); @@ -2288,7 +2631,11 @@ void int8_gemm( #else 0, #endif +<<<<<<< HEAD stream); +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -2317,11 +2664,14 @@ void int8_gemm( computeType, " scaleType ", scaleType); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -2461,6 +2811,11 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2476,6 +2831,11 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2488,6 +2848,11 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2501,6 +2866,11 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. 
NoTF32Guard disable_tf32; +<<<<<<< HEAD +======= + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2625,6 +2995,11 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } +<<<<<<< HEAD +======= +// HIP on Windows does not support +#if !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2823,5 +3198,9 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } +<<<<<<< HEAD +======= +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 0295948311a59..6e7003191533e 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -14,7 +14,10 @@ */ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::cuda::blas { @@ -148,13 +151,19 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, +<<<<<<< HEAD at::blas::ScalingType mat1_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, +<<<<<<< HEAD at::blas::ScalingType mat2_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, @@ -162,7 +171,11 @@ void scaled_gemm( int64_t result_ld, ScalarType result_dtype, bool use_fast_accum, +<<<<<<< HEAD const std::optional& alpha); +======= + bool use_rowwise); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) @@ -336,6 +349,12 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize +<<<<<<< HEAD +======= +// HIP on Windows does not support getrs, geqrf, getrf, gels +#if !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -390,4 +409,31 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD +======= +#else // !(defined(USE_ROCM) && defined(_MSC_VER)) + 
+template +void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); +} + +template +void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); +} + +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); +} + +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); +} + +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 81b4643ac0418..f28cb7b69ccc4 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -2,10 +2,17 @@ #include #include +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -246,6 +253,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { } }; +<<<<<<< HEAD // EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate // calls. cudaEventCreate when concurrently invoked from multiple threads can be // very expensive (especially on certain device/driver combinations). @@ -321,4 +329,6 @@ class EventPool { std::vector pools_; }; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 2e387fbc264d7..545ebbc3c20b8 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -15,6 +15,7 @@ namespace cuda::detail { namespace { // Total number of gpus in the system. +<<<<<<< HEAD int64_t num_gpus; // Ensures default_gens_cuda is initialized once. @@ -22,12 +23,25 @@ std::deque cuda_gens_init_flag; // Default, global CUDA generators, one per GPU. std::vector default_gens_cuda; +======= +static int64_t num_gpus; + +// Ensures default_gens_cuda is initialized once. +static std::deque cuda_gens_init_flag; + +// Default, global CUDA generators, one per GPU. +static std::vector default_gens_cuda; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* * Populates the global variables related to CUDA generators * Warning: this function must only be called once! */ +<<<<<<< HEAD void initCUDAGenVector() { +======= +static void initCUDAGenVector() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Ensures we only call cudaGetDeviceCount only once. static bool num_gpu_init_flag [[maybe_unused]] = []() { num_gpus = static_cast(c10::cuda::device_count()); @@ -109,7 +123,11 @@ void CUDAGeneratorState::increase(uint64_t increment) { offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); // Ensures the increment does not cause overflow. 
TORCH_INTERNAL_ASSERT( +<<<<<<< HEAD offset_intragraph_ <= std::numeric_limits::max() - increment, +======= + offset_intragraph_ <= std::numeric_limits::max() - increment, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Increment causes overflow in the offset value."); offset_intragraph_ += increment; } else { @@ -266,6 +284,7 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { +<<<<<<< HEAD if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { state_->seed_ = seed; state_->philox_offset_per_thread_ = 0; @@ -274,6 +293,13 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); // no-op case } +======= + at::cuda::assertNotCapturing( + "Cannot call CUDAGeneratorImpl::set_current_seed"); + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; + no_reset_rnn_state_.clear(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** @@ -302,6 +328,12 @@ uint64_t CUDAGeneratorImpl::get_offset() const { * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { +<<<<<<< HEAD +======= + // Debatable if current_seed() should be allowed in captured regions. + // Conservatively disallow it for now. + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return state_->seed_; } @@ -325,9 +357,15 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. +<<<<<<< HEAD constexpr size_t seed_size = sizeof(uint64_t); constexpr size_t offset_size = sizeof(int64_t); constexpr size_t total_size = seed_size + offset_size; +======= + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +384,17 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. 
*/ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { +<<<<<<< HEAD constexpr size_t seed_size = sizeof(uint64_t); constexpr size_t offset_size = sizeof(int64_t); constexpr size_t total_size = seed_size + offset_size; +======= + at::cuda::assertNotCapturing( + "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) detail::check_rng_state(new_state); @@ -400,6 +446,7 @@ c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { // see Note [Why enforce RNG offset % 4 == 0?] +<<<<<<< HEAD // Note: If you use CUDNN RNN's, calling // set_philox_offset_per_thread instead of set_offset will cause the @@ -410,17 +457,25 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { } else { state_->offset_intragraph_ = offset; } +======= + TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); + state_->philox_offset_per_thread_ = offset; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { +<<<<<<< HEAD if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { return state_->philox_offset_per_thread_; } else { return state_->offset_intragraph_; } +======= + return state_->philox_offset_per_thread_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /** @@ -461,7 +516,11 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { +<<<<<<< HEAD uint64_t offset = state_->offset_intragraph_; +======= + uint32_t offset = state_->offset_intragraph_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) state_->increase(increment); return PhiloxCudaState( state_->seed_extragraph_.data_ptr(), diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index d4ab49382e7ff..063579427a3b3 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -96,16 +96,28 @@ struct CUDAGraph; struct CUDAGeneratorState : public c10::intrusive_ptr_target { uint64_t seed_; uint64_t philox_offset_per_thread_; +<<<<<<< HEAD uint64_t offset_intragraph_; bool capturing_{}; std::unordered_set registered_graphs_; at::TensorBase seed_extragraph_; at::TensorBase offset_extragraph_; +======= + uint32_t offset_intragraph_; + bool capturing_{}; + std::unordered_set registered_graphs_; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDAGeneratorState( uint64_t seed = default_rng_seed_val, uint64_t philox_offset_per_thread = 0, +<<<<<<< HEAD uint64_t offset_intragraph = 0) +======= + uint32_t offset_intragraph = 0) 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : seed_(seed), philox_offset_per_thread_(philox_offset_per_thread), offset_intragraph_(offset_intragraph) {} @@ -167,7 +179,11 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { CUDAGeneratorImpl* clone_impl() const override; c10::intrusive_ptr state_; +<<<<<<< HEAD std::atomic_flag no_reset_rnn_state_; +======= + std::atomic_flag no_reset_rnn_state_{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; namespace cuda::detail { diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 31d2d3f1fe589..a7bb66970e2cb 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -2,6 +2,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -168,9 +172,17 @@ void CUDAGraph::instantiate() { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 // cudaGraphInstantiateWithFlags // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233 +<<<<<<< HEAD int version = 0; AT_CUDA_CHECK(cudaDriverGetVersion(&version)); if (version < 11040) { +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 + int version = 0; + AT_CUDA_CHECK(cudaDriverGetVersion(&version)); + if (version < 11040) { +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, // who prefer not to report error message through these arguments moving forward // (they prefer return value, or errors on api calls internal to the capture) @@ -181,11 +193,19 @@ void CUDAGraph::instantiate() { #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
+<<<<<<< HEAD +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) has_graph_exec_ = true; } @@ -248,6 +268,7 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() { return graph_; } +<<<<<<< HEAD cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { TORCH_CHECK( has_graph_exec_, @@ -255,6 +276,8 @@ cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { return graph_exec_; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build @@ -307,7 +330,11 @@ CUDAGraph::~CUDAGraph() { // There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory. // They wait for next sync point in order to free the memory, this is to ensure that all // hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2. +<<<<<<< HEAD // We need to ensure all async operations finish before deleting the object. +======= +// We need to ensure all async opreations finish before deleting the object. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if (defined(USE_ROCM) && ROCM_VERSION >= 60200) if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id { diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index a32e7b4b86f07..075e6ed31bc68 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -2,7 +2,10 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -37,7 +40,10 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); cudaGraph_t raw_cuda_graph(); +<<<<<<< HEAD cudaGraphExec_t raw_cuda_graph_exec(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: cudaGraph_t graph_ = nullptr; @@ -56,7 +62,11 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // the ID assigned by cuda during graph capture, // used to identify when a stream is participating in capture +<<<<<<< HEAD CaptureId_t capture_id_ = 0; +======= + CaptureId_t capture_id_ = -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // uuid used to request a particular private mempool from CUDACachingAllocator. // By default, this will be set to {id_, 0}. 
diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index e00e50b38d2de..380769f9076e0 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -6,15 +6,54 @@ #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch) #endif +<<<<<<< HEAD +======= +// cuSparse Generic API added in CUDA 10.1 +// Windows support added in CUDA 11.0 +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32))) +#define AT_USE_CUSPARSE_GENERIC_API() 1 +#else +#define AT_USE_CUSPARSE_GENERIC_API() 0 +#endif + +// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0 +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION < 12000) +#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1 +#else +#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0 +#endif + +#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION >= 12000) +#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1 +#else +#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0 +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 +<<<<<<< HEAD +#define AT_USE_HIPSPARSE_GENERIC_API() 1 +#else +#define AT_USE_HIPSPARSE_GENERIC_API() 1 +#endif +#else // USE_ROCM +======= +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #endif #else // USE_ROCM +#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index d5f04df55f9c2..742398dbdb8fb 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -12,6 +12,11 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { return cusparseDestroyDnMat(const_cast(dnMatDescr)); } +<<<<<<< HEAD +======= +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // If a specific GPU model does not provide native support for a given data @@ -208,4 +213,9 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 descriptor_.reset(raw_descriptor); } +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index f12ef628e13f5..98547c4879396 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -35,6 +35,10 @@ class CuSparseDescriptor { std::unique_ptr> 
descriptor_; }; +<<<<<<< HEAD +======= +#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct ConstCuSparseDescriptorDeleter { void operator()(T* x) { @@ -57,6 +61,10 @@ class ConstCuSparseDescriptor { protected: std::unique_ptr> descriptor_; }; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) using cusparseMatDescr = std::remove_pointer_t; @@ -121,8 +129,44 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE +<<<<<<< HEAD cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); +======= +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); + +#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() +class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); +}; + +class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } +}; + +class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseDnVecDescriptor(const Tensor& input); +}; + +class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor + : public CuSparseDescriptor {}; + +#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< cusparseDnMatDescr, @@ -161,6 +205,10 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { @@ -249,4 +297,9 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor } }; +<<<<<<< HEAD +======= +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 5786e87dac519..53f3aa74feeec 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,6 +9,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) namespace at::cuda { namespace { @@ -71,6 +75,7 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: +<<<<<<< HEAD ska::flat_hash_map use_host_register; void allocate_host_memory(size_t size, void** ptr) override { @@ -85,6 +90,11 @@ struct CUDACachingHostAllocatorImpl } void allocate_host_memory_slowpath(size_t size, void** ptr) { +======= + std::unordered_map use_host_register; + + void allocate_host_memory(size_t size, void** ptr) override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -123,6 +133,7 @@ struct CUDACachingHostAllocatorImpl } void free_block(Block* block) override { +<<<<<<< HEAD // We never free blocks from the reserve segment if (get_reserve_segment().initialized()) { // Check if the block is from the reserve segment @@ -138,6 +149,11 @@ struct CUDACachingHostAllocatorImpl auto start = std::chrono::steady_clock::now(); // Users may change the allocator config at will. torch unit tests do this. // However, allocations using cudaHostRegister should use corresponding +======= + auto start = std::chrono::steady_clock::now(); + // Users may change the allocator config at will. torch unit tests do this. + // However, allocations using cudaHostRegister should use corresonding +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost. void* ptr = block->ptr_; bool use_register = false; @@ -183,12 +199,21 @@ struct CUDACachingHostAllocatorImpl return true; } +<<<<<<< HEAD +======= + bool pinned_use_background_threads() override { + return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: + pinned_use_background_threads(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EventPool::Event create_event_internal(DeviceIndex idx) { // Leak the event pool to avoid shutdown issue. 
static auto* event_pool = new EventPool(); return event_pool->get(idx); } +<<<<<<< HEAD PinnedReserveSegment& get_reserve_segment() { static auto reserve_segment = [&]() { if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) { @@ -203,6 +228,8 @@ struct CUDACachingHostAllocatorImpl return reserve_segment; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TaskThreadPool* getThreadPool() { static TaskThreadPool* pool = new TaskThreadPool( static_cast(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: @@ -217,15 +244,24 @@ struct CUDACachingHostAllocatorImpl size_t numThreads, size_t pageSize) { uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); +<<<<<<< HEAD uintptr_t end = start + (size / numThreads); +======= + uintptr_t end = (uintptr_t)start + (size / numThreads); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == (numThreads - 1)) { end = (uintptr_t)ptr + size; } // pre-fault/map the pages by setting the first byte of the page uintptr_t alignedStart = +<<<<<<< HEAD ((start + pageSize - 1) & ~(pageSize - 1)); for (uintptr_t p = alignedStart; p < (end); p += pageSize) { +======= + (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(performance-no-int-to-ptr) memset((void*)p, 0, 1); } @@ -289,7 +325,11 @@ DECLARE_HOST_ALLOCATOR( CUDACachingHostAllocator, CUDACachingHostAllocatorImpl, raw_local_deleter, +<<<<<<< HEAD caching_host_allocator) +======= + caching_host_allocator); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator) diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index d7832c761ae55..fa404b0c7711c 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -309,8 +309,12 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
+<<<<<<< HEAD if (!NoTF32Guard::should_disable_tf32() && at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp index 66a75db6ea067..93ca53c77d6df 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp +++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp @@ -4,9 +4,12 @@ #include #include +<<<<<<< HEAD #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -15,7 +18,10 @@ namespace at::cuda { static std::vector p2pAccessEnabled_; +<<<<<<< HEAD static std::vector fabricAccessEnabled_; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static int64_t num_devices_ = -1; namespace detail { @@ -33,15 +39,22 @@ void init_p2p_access_cache(int64_t num_devices) { for (const auto i : c10::irange(num_devices)) { p2pAccessEnabled_[i * num_devices + i] = 1; } +<<<<<<< HEAD fabricAccessEnabled_.clear(); fabricAccessEnabled_.resize(num_devices, -1); } } // namespace detail +======= +} + +} // namespace detail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); +<<<<<<< HEAD TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); TORCH_CHECK( dev_to_access >= 0 || dev_to_access < num_devices_, @@ -50,6 +63,15 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; +======= + TORCH_CHECK(dev >= 0 || dev < num_devices_, + dev, " is not a device"); + TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_, + dev_to_access, " is not a device"); + TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); + + auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (cache != -1) { return cache; @@ -65,6 +87,7 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { return cache; } +<<<<<<< HEAD namespace { #if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED @@ -180,3 +203,6 @@ bool get_fabric_access(c10::DeviceIndex dev) { } } // namespace at::cuda +======= +} // namespace at::cuda::detail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.h b/aten/src/ATen/cuda/PeerToPeerAccess.h index 30d21af83ed88..c041aa7d6f107 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.h +++ 
b/aten/src/ATen/cuda/PeerToPeerAccess.h @@ -8,6 +8,9 @@ void init_p2p_access_cache(int64_t num_devices); } TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev); +<<<<<<< HEAD TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index bc863b8880da7..3bcbe9d60dc9b 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -15,7 +15,12 @@ struct SumOp { template void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t num_items) { +<<<<<<< HEAD inclusive_scan(input, output, NO_ROCM(::cuda)::std::plus<>{}, num_items); +======= + using NO_ROCM(at_cuda_detail)::cub::Sum; + inclusive_scan(input, output, Sum{}, num_items); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template void inclusive_sum_truncating(const int32_t *input, int32_t *output, int64_t num_items); @@ -41,7 +46,12 @@ struct CountMaskOp { void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n) { CountMaskOp op{}; +<<<<<<< HEAD auto iter = ATEN_CUB_TRANSFORM_ITERATOR(bool, decltype(op), decltype(mask))(mask, op); +======= + auto iter = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator< + bool, decltype(op), decltype(mask)>(mask, op); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exclusive_scan(iter, output_idx, SumOp{}, int64_t{0}, n); } diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 5c83810164adb..cf121a8604595 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,10 +6,13 @@ #include #include +<<<<<<< HEAD #ifndef USE_ROCM #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -55,6 +58,7 @@ #define ROCM_HIPCUB(x) x #endif +<<<<<<< HEAD #if CUB_V3_PLUS() #include #include @@ -71,6 +75,13 @@ #endif #if defined(USE_ROCM) +======= +#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM) + +#if !defined(USE_ROCM) +namespace at_cuda_detail { +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 @@ -92,6 +103,13 @@ template <> struct ROCM_HIPCUB(cub)::NumericTraits: ROCM_HIPCUB(cub)::BaseTraits {}; +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) +} // namespace at_cuda_detail +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #if !defined(USE_ROCM) @@ -113,7 +131,11 @@ struct cuda_type { using type = __half; }; +<<<<<<< HEAD #if !defined(USE_ROCM) +======= +#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> struct cuda_type { @@ -169,6 +191,10 @@ inline void segmented_sort_pairs( } } +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template 
inline void unique_by_key( KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, @@ -184,6 +210,10 @@ inline void unique_by_key( CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace impl { @@ -195,6 +225,39 @@ __global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputItera *out = scan_op(static_cast(*a), static_cast(*b)); } +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() +template +struct chained_iterator { + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = ValueT; + using pointer = ValueT*; + using reference = ValueT&; + + InputIteratorT iter; + ValueT *first; + difference_type offset = 0; + + __device__ ValueT operator[](difference_type i) { + i += offset; + if (i == 0) { + return *first; + } else { + return ValueT(iter[i - 1]); + } + } + __device__ chained_iterator operator+(difference_type i) { + return chained_iterator{iter, first, i}; + } + __device__ ValueT operator*() { + return (*this)[0]; + } +}; +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // even though cub is supposed to support tensors with int_max elements, in reality it doesn't, // so split at int_max/2 constexpr int max_cub_size = std::numeric_limits::max() / 2 + 1; // 2**30 @@ -239,6 +302,28 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT first_elem_ptr, scan_op); C10_CUDA_KERNEL_LAUNCH_CHECK(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() + using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator; + using tuple = typename ArgIndexInputIterator::value_type; + auto input_iter_transform = [=] __device__ (const tuple &x)->input_t { + if (x.key == 0) { + return *first_elem_ptr; + } else { + return x.value; + } + }; + auto input_ = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator( + ArgIndexInputIterator(input + i), input_iter_transform); + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, + size_cub, + at::cuda::getCurrentCUDAStream()); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, input + i + 1, output + i, @@ -246,6 +331,10 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT ::at_cuda_detail::cub::FutureValue(first_elem_ptr), size_cub, at::cuda::getCurrentCUDAStream()); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } @@ -384,7 +473,11 @@ __global__ void calc_block_sums(const T * d_in, aggT * agg, int64_t nelem, int i aggT data[ITEMS_PER_THREAD]; aggT agg_val = 0; TransformFunctor transform_functor; +<<<<<<< HEAD auto iter_in = ATEN_CUB_TRANSFORM_ITERATOR(aggT, TransformFunctor, const T*)(d_in, transform_functor); +======= + auto iter_in = ROCM_HIPCUB(at_cuda_detail::cub)::TransformInputIterator, const T*>(d_in, transform_functor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) for (int i=0; i= BLOCK_THREADS * ITEMS_PER_THREAD) { BlockLoadT(temp_storage.load).Load(iter_in, data); @@ -497,6 +590,19 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT first_elem_ptr, scan_op); C10_CUDA_KERNEL_LAUNCH_CHECK(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_FUTURE_VALUE() + auto input_ = impl::chained_iterator{ + input + i, first_elem_ptr}; + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan, + input_, + output + i, + scan_op, + size_cub, + at::cuda::getCurrentCUDAStream()); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan, input + i, output + i, @@ -504,10 +610,18 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT ::at_cuda_detail::cub::FutureValue(first_elem_ptr), size_cub, at::cuda::getCurrentCUDAStream()); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif } +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { @@ -515,7 +629,11 @@ inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT i "cub InclusiveSumByKey does not support more than INT_MAX elements"); #if !defined(USE_ROCM) CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveSumByKey, +<<<<<<< HEAD keys, input, output, num_items, NO_ROCM(::cuda)::std::equal_to<>(), at::cuda::getCurrentCUDAStream()); +======= + keys, input, output, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else CUB_WRAPPER(cub::DeviceScan::InclusiveSumByKey, keys, input, output, num_items, hipcub::Equality(), at::cuda::getCurrentCUDAStream()); @@ -528,13 +646,21 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT "cub InclusiveSumByKey does not support more than INT_MAX elements"); #if !defined(USE_ROCM) CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveScanByKey, +<<<<<<< HEAD keys, input, output, scan_op, num_items, NO_ROCM(::cuda)::std::equal_to<>(), at::cuda::getCurrentCUDAStream()); +======= + keys, input, output, scan_op, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else CUB_WRAPPER(cub::DeviceScan::InclusiveScanByKey, keys, input, output, scan_op, num_items, hipcub::Equality(), at::cuda::getCurrentCUDAStream()); #endif } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void unique(InputIteratorT input, OutputIteratorT output, diff --git a/aten/src/ATen/cuda/cub.h b/aten/src/ATen/cuda/cub.h index 7430edaf8a3dc..97b468a806f8e 100644 --- a/aten/src/ATen/cuda/cub.h +++ b/aten/src/ATen/cuda/cub.h @@ -4,7 +4,11 @@ #include // NOTE: These templates are intentionally not defined in this 
header, +<<<<<<< HEAD // which avoids re-compiling them for each translation unit. If you get +======= +// which aviods re-compiling them for each translation unit. If you get +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // a link error, you need to add an explicit instantiation for your // types in cub.cu diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index e0d7455d4c22b..1c26e01ea722d 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -10,6 +10,17 @@ #define CUB_VERSION 200001 #endif +<<<<<<< HEAD +======= +// cub sort support for __nv_bfloat16 is added to cub 1.13 in: +// https://github.com/NVIDIA/cub/pull/306 +#if CUB_VERSION >= 101300 +#define CUB_SUPPORTS_NV_BFLOAT16() true +#else +#define CUB_SUPPORTS_NV_BFLOAT16() false +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: // https://github.com/NVIDIA/cub/pull/326 // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake @@ -20,10 +31,35 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif +<<<<<<< HEAD // There were many bc-breaking changes in major version release of CCCL v3.0.0 // Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html #if CUB_VERSION >= 200800 #define CUB_V3_PLUS() true #else #define CUB_V3_PLUS() false +======= +// cub support for UniqueByKey is added to cub 1.16 in: +// https://github.com/NVIDIA/cub/pull/405 +#if CUB_VERSION >= 101600 +#define CUB_SUPPORTS_UNIQUE_BY_KEY() true +#else +#define CUB_SUPPORTS_UNIQUE_BY_KEY() false +#endif + +// cub support for scan by key is added to cub 1.15 +// in https://github.com/NVIDIA/cub/pull/376 +#if CUB_VERSION >= 101500 +#define CUB_SUPPORTS_SCAN_BY_KEY() 1 +#else +#define CUB_SUPPORTS_SCAN_BY_KEY() 0 +#endif + +// cub support for cub::FutureValue is added to cub 1.15 in: +// https://github.com/NVIDIA/cub/pull/305 +#if CUB_VERSION >= 101500 +#define CUB_SUPPORTS_FUTURE_VALUE() true +#else +#define CUB_SUPPORTS_FUTURE_VALUE() false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b7f80101d926e..5cb3f7212f613 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,6 +19,13 @@ #include #include +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_CUDNN_ENABLED() #include #endif @@ -89,6 +96,32 @@ void CUDAHooks::init() const { // have a chance to enable vitals. at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true); +<<<<<<< HEAD +======= + // Sets the CUDA_MODULE_LOADING environment variable + // if it's not set by the user. + // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. + // Check the driver version and only set the env variable if needed. 
+ bool set_lazy_module_loading = true; + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + // Initialize NVML + if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { + // Get the driver version + int version = -1; + auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); + if (res == NVML_SUCCESS) { + // Check if driver is sufficiently new + if (version >= 12020) { + set_lazy_module_loading = false; + } + } + } + #endif + if (set_lazy_module_loading) { + c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -180,6 +213,7 @@ bool CUDAHooks::hasCuBLASLt() const { #endif } +<<<<<<< HEAD bool CUDAHooks::hasCKSDPA() const { #if !defined(USE_ROCM) @@ -201,6 +235,8 @@ bool CUDAHooks::hasCKGEMM() const { #endif } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. // But in future if there are ROCm builds without MIOpen, @@ -281,9 +317,12 @@ bool CUDAHooks::compiledWithMIOpen() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return true; @@ -294,9 +333,12 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 7) { @@ -311,6 +353,7 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { #if AT_CUDNN_ENABLED() +<<<<<<< HEAD if (!hasCUDA()) { return false; } @@ -331,6 +374,8 @@ bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const { if (!hasCUDA()) { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 8) { diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 8d3d1db003928..a2940b298e09c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -17,7 +17,11 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); // The real implementation of CUDAHooksInterface struct CUDAHooks : public at::CUDAHooksInterface { +<<<<<<< HEAD CUDAHooks(at::CUDAHooksArgs /*unused*/) {} +======= + CUDAHooks(at::CUDAHooksArgs) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void init() const override; Device getDeviceFromPtr(void* 
data) const override; bool isPinnedPtr(const void* data) const override; @@ -31,8 +35,11 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCuSOLVER() const override; bool hasCuBLASLt() const override; bool hasROCM() const override; +<<<<<<< HEAD bool hasCKSDPA() const override; bool hasCKGEMM() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; bool isBuilt() const override {return true;} @@ -45,7 +52,10 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool supportsDilatedConvolutionWithCuDNN() const override; bool supportsDepthwiseConvolutionWithCuDNN() const override; bool supportsBFloat16ConvolutionWithCuDNNv8() const override; +<<<<<<< HEAD bool supportsBFloat16RNNWithCuDNN() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; diff --git a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h index 71a344d281d2a..0a05477e08555 100644 --- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h +++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h @@ -122,7 +122,11 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto parent = weak_parent.lock(); if (!parent) { // If this thread exits after atexit handlers have completed, the diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index 487e798bd80f6..5a050d1937e87 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -49,12 +49,20 @@ struct OffsetCalculator { #if defined(USE_ROCM) if ((dims > 0) && (dims <= 2)) { auto divmod = sizes_[0].divmod(linear_idx); +<<<<<<< HEAD #pragma unroll +======= + #pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int arg = 0; arg < NARGS; arg++) offsets[arg] = divmod.mod * strides_[0][arg]; if (dims >= 2) { divmod = sizes_[1].divmod(divmod.div); +<<<<<<< HEAD #pragma unroll +======= + #pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int arg = 0; arg < NARGS; arg++) offsets[arg] += divmod.mod * strides_[1][arg]; } diff --git a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh index 7de0321256fd7..5a63e455db600 100644 --- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -19,7 +19,11 @@ struct PhiloxCudaState { // Called if graph capture is underway PhiloxCudaState(int64_t* seed, int64_t* offset_extragraph, +<<<<<<< HEAD uint64_t offset_intragraph) { +======= + uint32_t offset_intragraph) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) seed_.ptr = seed; offset_.ptr = offset_extragraph; offset_intragraph_ = offset_intragraph; @@ -36,7 +40,11 @@ struct PhiloxCudaState { Payload seed_{}; Payload offset_{}; +<<<<<<< HEAD 
uint64_t offset_intragraph_ = 0; +======= + uint32_t offset_intragraph_ = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool captured_ = false; }; diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index aca83386ad421..cbf4d811ebf14 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,8 +117,11 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ +<<<<<<< HEAD _(cuModuleLoad) \ _(cuGetErrorString) \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 5d9e33b2b5b2f..4ff019366af75 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -13,7 +13,10 @@ #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -30,8 +33,11 @@ namespace at::cuda::tunable { +<<<<<<< HEAD using at::blas::ScalingType; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enum class BlasOp { N = 0, T = 1 @@ -151,7 +157,10 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { BLASType = "unknown"; } return BLASType; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Similar to Compute Type in GemmRocblas.h @@ -164,7 +173,11 @@ inline std::string ComputeTypeFor() { // ROCBLAS and hipBLASLt. 
template <> inline std::string ComputeTypeFor() { +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { +======= + if (!at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return "f32_r"; } else { return "xf32_r"; @@ -246,6 +259,7 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio namespace detail { +<<<<<<< HEAD static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { if (!config.enabled) { @@ -253,10 +267,16 @@ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t siz } auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); +======= +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + // comparison done as 1D tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor ref = at::from_blob(c, {size}, options); at::Tensor oth = at::from_blob(other_c, {size}, options); at::Tensor ref_float = ref.to(at::kFloat); at::Tensor oth_float = oth.to(at::kFloat); +<<<<<<< HEAD const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); if (ok) { @@ -265,6 +285,28 @@ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t siz TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); } return ok; +======= + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return false; + } + else { + TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -349,10 +391,15 @@ struct GemmParams : OpParams { } TuningStatus NumericalCheck(GemmParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -445,10 +492,15 @@ struct GemmAndBiasParams : OpParams { } TuningStatus NumericalCheck(GemmAndBiasParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -544,10 +596,15 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -600,8 +657,12 @@ struct ScaledGemmParams : OpParams { // // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector. return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s", +<<<<<<< HEAD transa, transb, m, n, k, lda, ldb, ldc, a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise, +======= + transa, transb, m, n, k, lda, ldb, ldc, use_rowwise, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bias_ptr == nullptr ? "None" : at::toString(bias_dtype)); } @@ -663,9 +724,13 @@ struct ScaledGemmParams : OpParams { } TuningStatus NumericalCheck(ScaledGemmParams *other) { +<<<<<<< HEAD auto* ctx = getTuningContext(); auto cfg = ctx->GetNumericalCheckConfig(); return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; +======= + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } char transa{}; @@ -678,13 +743,19 @@ struct ScaledGemmParams : OpParams { int64_t lda{}; ScalarType a_dtype{}; ScalarType a_scale_dtype{}; +<<<<<<< HEAD ScalingType a_scaling_type{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* b{}; const void* b_scale_ptr{}; int64_t ldb{}; ScalarType b_dtype{}; ScalarType b_scale_dtype{}; +<<<<<<< HEAD ScalingType b_scaling_type{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const void* bias_ptr{}; ScalarType bias_dtype{}; void* c{}; @@ -693,6 +764,10 @@ struct ScaledGemmParams : OpParams { ScalarType c_dtype{}; void* amax_ptr{}; bool use_fast_accum{}; +<<<<<<< HEAD +======= + bool use_rowwise{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: bool duplicate_inputs_{false}; }; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 10490b0323ed9..bd80943804c47 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -14,7 +14,10 @@ #include #include +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_HIPBLASLT_CHECK(EXPR) \ do { \ hipblasStatus_t __err = EXPR; \ @@ -216,6 +219,7 @@ float GetBetaFromParams(const ScaledGemmParams* params) { } template +<<<<<<< HEAD ScalingType GetAScalingTypeFromParams(const GemmParams* params) { return ScalingType::TensorWise; } @@ -253,6 +257,25 @@ ScalingType GetAScalingTypeFromParams(const ScaledGemmParams* params) { template ScalingType GetBScalingTypeFromParams(const ScaledGemmParams* params) { return params->b_scaling_type; +======= +bool GetUseRowwiseFromParams(const GemmParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const GemmAndBiasParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const GemmStridedBatchedParams* params) { + return false; +} + +template +bool GetUseRowwiseFromParams(const ScaledGemmParams* params) { + return params->use_rowwise; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template @@ -507,7 +530,11 @@ class HipblasltGemmOp : public Callable { } hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { +======= + if (at::globalContext().allowTF32CuBLAS()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; } HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); @@ -519,6 +546,7 @@ class HipblasltGemmOp : public Callable { const void* mat2_scale_ptr = GetBScalePointerFromParams(params); const void* result_scale_ptr = GetDScalePointerFromParams(params); if (mat1_scale_ptr && mat2_scale_ptr) { +<<<<<<< HEAD hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER; hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = 
HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER; if (GetAScalingTypeFromParams(params) == ScalingType::RowWise) { @@ -537,6 +565,25 @@ class HipblasltGemmOp : public Callable { } matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr); matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr); +======= +#ifdef HIPBLASLT_VEC_EXT + if (GetUseRowwiseFromParams(params)) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT, mat2_scale_ptr); + } + else +#endif + { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); + } +#ifdef HIPBLASLT_OUTER_VEC + if (GetUseRowwiseFromParams(params)) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (result_scale_ptr) { matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h index 60eaa2e4d4754..234456bd937e5 100644 --- a/aten/src/ATen/cuda/tunable/GemmRocblas.h +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -141,7 +141,11 @@ class RocblasGemmOp : public Callable> { TuningStatus Call(const GemmParams* params) override { auto input_output_type = RocBlasDataTypeFor(); +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) +======= + if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); @@ -209,7 +213,11 @@ class RocblasGemmStridedBatchedOp : public Callable> TuningStatus Call(const GemmStridedBatchedParams* params) override { auto input_output_type = RocBlasDataTypeFor(); +<<<<<<< HEAD if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) +======= + if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md index 1a9c91dab7c0b..eca37cee98b5f 100644 --- a/aten/src/ATen/cuda/tunable/README.md +++ b/aten/src/ATen/cuda/tunable/README.md @@ -38,7 +38,11 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262 GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033 ``` +<<<<<<< HEAD Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect +======= +Note the "Validator" lines. 
If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) this and reject the tunings file because the prior tunings are likely affected by other software changes. The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of @@ -145,7 +149,11 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. | | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. | | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. | +<<<<<<< HEAD | PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". | +======= +| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. | | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. | | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. | @@ -154,7 +162,11 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS | Default is 0, meaning it is not used. | | PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED | Default is 1. Set to 0 to disable. | | PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE | Default (or < 0) is to query L2 cache size. Set to 0 to disable. Otherwise, set to the number of MiB to use for the pool of operator parameters. For example, setting this to the size of your device's memory cache will guarantee that every tuning iteration will use a cold cache. | +<<<<<<< HEAD | PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS parameters to tuning CSV file. | +======= +| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS paramters to tuning CSV file. | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Python Interface All python APIs exist in the `torch.cuda.tunable` module. @@ -173,9 +185,16 @@ All python APIs exist in the `torch.cuda.tunable` module. | get_max_tuning_iterations() -> int | | | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | | | get_filename() -> str | | +<<<<<<< HEAD | set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5. | get_results() -> Tuple[str, str, str, float] | | | get_validators() -> Tuple[str, str] | | +======= +| get_results() -> Tuple[str, str, str, float] | | +| get_validators() -> Tuple[str, str] | | +| write_file_on_exit(val: bool) -> None | Default is True. | +| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). 
| +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. | | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. | diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9fb04b40d30f6..5096059ba48e1 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -107,6 +107,7 @@ void TuningResultsManager::AddImpl(const std::string& op_signature, } void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { +<<<<<<< HEAD bool is_new = false; ResultEntry inserted = ResultEntry::Null(); @@ -131,6 +132,16 @@ void TuningResultsManager::Add(const std::string& op_signature, const std::strin } } +======= + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + it = results_.insert({op_signature, {}}).first; + } + + AddImpl(op_signature, params_signature, std::move(best), it->second); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, @@ -166,6 +177,7 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std } } +<<<<<<< HEAD void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map& validators) { std::scoped_lock fl{realtime_file_mutex_}; @@ -237,6 +249,8 @@ void TuningResultsManager::CloseRealtimeAppend() { } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { std::scoped_lock l{lock_}; @@ -307,6 +321,7 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM +<<<<<<< HEAD // hip { // HIP version is more accurate than ROCm version. User's environment could be a stock @@ -318,6 +333,21 @@ TuningResultsValidator::TuningResultsValidator() { [hip_version](auto&& k) { TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); return hip_version == k ? OK : FAIL; +======= + // rocm + { +#ifdef _WIN32 + std::string rocm_version = HIP_VERSION_BUILD_NAME; +#else + std::string rocm_version = ROCM_BUILD_INFO; +#endif + RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { + TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); + return rocm_version == k ? 
OK : FAIL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } // gfx arch @@ -483,6 +513,10 @@ TuningContext::TuningContext() : tuning_enable_{true}, record_untuned_enable_{false}, manager_initialized_{false}, +<<<<<<< HEAD +======= + write_file_on_exit_{true}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) numerics_check_enable_{false}, max_tuning_duration_ms_{30}, max_tuning_iterations_{100}, @@ -490,6 +524,11 @@ TuningContext::TuningContext() : max_warmup_iterations_{0}, icache_flush_{true}, rotating_buffer_size_{-1}, +<<<<<<< HEAD +======= + filename_{}, + untuned_file_{}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) results_count_from_input_file_{0}, is_shutting_down_{false} { @@ -503,8 +542,25 @@ TuningContext::~TuningContext() { // but doesn't do any computation itself. return; } +<<<<<<< HEAD TUNABLE_LOG1("Closing File"); GetTuningResultsManager().CloseRealtimeAppend(); // Since, we do instant logging by default now. +======= + auto filename = GetFilename(); + if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) { + if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { + if (results_count_from_input_file_ > 0) { + TUNABLE_LOG1("additional tuning results available, rewriting file ", filename); + } + else { + TUNABLE_LOG1("writing file ", filename); + } + if (!WriteFile(filename)) { + TUNABLE_LOG1("failed to write file ", filename); + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (untuned_file_.good()) { untuned_file_.close(); @@ -580,16 +636,27 @@ std::ofstream& TuningContext::GetUntunedFile(){ filename.append(device); } +<<<<<<< HEAD untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app); +======= + untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return untuned_file_; } +<<<<<<< HEAD +======= +void TuningContext::WriteFileOnExit(bool value) { + write_file_on_exit_ = value; +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void TuningContext::EnableNumericsCheck(bool value) { numerics_check_enable_ = value; } +<<<<<<< HEAD NumericalCheckConfig TuningContext::GetNumericalCheckConfig() const { const auto env_opt = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); @@ -633,6 +700,14 @@ void TuningContext::SetNumericalCheckConfig(bool enabled, double atol, double rt bool TuningContext::IsNumericsCheckEnabled() const { const auto cfg = GetNumericalCheckConfig(); return cfg.enabled || numerics_check_enable_; +======= +bool TuningContext::IsNumericsCheckEnabled() const { + const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env == "1") { + return true; + } + return numerics_check_enable_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { @@ -742,6 +817,14 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() { auto filename = GetFilename(); if (!filename.empty() 
&& !IsRecordUntunedEnabled()) { ReadFile(filename); +<<<<<<< HEAD +======= + // attempt immediately to open file for writing to catch errors early + std::ofstream file(filename, std::ios::out | std::ios::app); + if (!file.good()) { + TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); return manager_; @@ -847,6 +930,30 @@ bool TuningContext::ReadFile(const std::string& filename_) { return true; } +<<<<<<< HEAD +======= +bool TuningContext::WriteFile(const std::string& filename_) { + std::string filename = filename_.empty() ? GetFilename() : filename_; + std::ofstream file(filename, std::ios::out | std::ios::trunc); + if (!file.good()) { + TUNABLE_LOG1("error opening tuning results file for writing ", filename); + return false; + } + auto validators = GetTuningResultsValidator().GetAllValidators(); + for (const auto& [key, val] : validators) { + file << "Validator," << key << "," << val << std::endl; + } + auto results = GetTuningResultsManager().Dump(); + for (const auto& [op_sig, kernelmap] : results) { + for (const auto& [param_sig, result] : kernelmap) { + file << op_sig << "," << param_sig << "," << result << std::endl; + } + } + file.close(); + return true; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { struct MaybeDelete { diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index 17b4ea34ddf61..999864c8731b8 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -103,6 +103,7 @@ class TORCH_CUDA_CPP_API TuningResultsManager { void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, const std::string& params_signature, const std::string& blas_signature); +<<<<<<< HEAD void InitRealtimeAppend( const std::string& filename, @@ -121,6 +122,12 @@ class TORCH_CUDA_CPP_API TuningResultsManager { ResultsMap results_; UntunedMap untuned_results_; bool validators_written_ = false; +======= + private: + std::mutex lock_; + ResultsMap results_; + UntunedMap untuned_results_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; @@ -148,6 +155,7 @@ class TORCH_CUDA_CPP_API TuningResultsValidator { GetValidateFuncs validators_; }; +<<<<<<< HEAD struct NumericalCheckConfig { bool enabled{false}; double atol{1e-5}; @@ -158,6 +166,8 @@ struct NumericalCheckConfig { }; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TORCH_CUDA_CPP_API TuningContext { public: TuningContext(); @@ -179,8 +189,11 @@ class TORCH_CUDA_CPP_API TuningContext { void EnableNumericsCheck(bool value); bool IsNumericsCheckEnabled() const; +<<<<<<< HEAD void SetNumericalCheckConfig(bool enabled, double atol, double rtol); NumericalCheckConfig GetNumericalCheckConfig() const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void SetMaxTuningDurationMs(int max_duration_ms); int GetMaxTuningDurationMs() const; @@ -211,7 +224,14 @@ class TORCH_CUDA_CPP_API TuningContext { void SetFilename(const std::string& filename, bool insert_device_ordinal=false); std::string GetFilename() const; +<<<<<<< HEAD + bool 
ReadFile(const std::string& filename={}); +======= + void WriteFileOnExit(bool value); + bool ReadFile(const std::string& filename={}); + bool WriteFile(const std::string& filename={}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void Log(int level, Types... args) { @@ -230,6 +250,10 @@ class TORCH_CUDA_CPP_API TuningContext { bool tuning_enable_; bool record_untuned_enable_; bool manager_initialized_; +<<<<<<< HEAD +======= + bool write_file_on_exit_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool numerics_check_enable_; int max_tuning_duration_ms_; int max_tuning_iterations_; @@ -244,8 +268,11 @@ class TORCH_CUDA_CPP_API TuningContext { std::ofstream untuned_file_; size_t results_count_from_input_file_; bool is_shutting_down_; +<<<<<<< HEAD NumericalCheckConfig numerics_cfg_{}; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; TORCH_CUDA_CPP_API TuningContext* getTuningContext(); diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h index c014d1ea569c8..801d48a2e8045 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -96,13 +96,19 @@ class DefaultScaledGemmOp : public Callable> { params->lda, params->a_dtype, params->a_scale_dtype, +<<<<<<< HEAD params->a_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params->b, params->b_scale_ptr, params->ldb, params->b_dtype, params->b_scale_dtype, +<<<<<<< HEAD params->b_scaling_type, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params->bias_ptr, params->bias_dtype, params->c, @@ -110,7 +116,11 @@ class DefaultScaledGemmOp : public Callable> { params->ldc, params->c_dtype, params->use_fast_accum, +<<<<<<< HEAD std::nullopt /* alpha */); +======= + params->use_rowwise); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return OK; } }; diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h index 830473cb4ca9e..7620754fba887 100644 --- a/aten/src/ATen/cuda/tunable/TunableOp.h +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -29,7 +29,11 @@ template class Callable { public: virtual ~Callable() = default; +<<<<<<< HEAD virtual TuningStatus Call(const ParamsT* /*unused*/) { +======= + virtual TuningStatus Call(const ParamsT*) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FAIL; } virtual TuningStatus IsSupported(const ParamsT* params) { @@ -235,7 +239,11 @@ class TunableOp { // numeric check option is controlled by non-static env var, so check it once per tuned operator bool do_numerics_check = ctx->IsNumericsCheckEnabled(); +<<<<<<< HEAD // calculate a reference answer for numerical check +======= + // calcaulte a reference answer for numerical check +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (do_numerics_check) { reference_params = params->DeepCopy(false); TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == 
OK); @@ -267,10 +275,34 @@ class TunableOp { for (size_t i = 0; i < op_names_.size(); i++) { auto* candidate = ops_[op_names_[i]].get(); // borrow pointer +<<<<<<< HEAD auto status = candidate->Call(reusable_params[0]); if (status != OK) { TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); continue; +======= + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + else { + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // collect a small profile @@ -293,6 +325,7 @@ class TunableOp { continue; } +<<<<<<< HEAD if (do_numerics_check) { ParamsT* numerical_params = params->DeepCopy(false); auto status = candidate->Call(numerical_params); @@ -309,6 +342,8 @@ class TunableOp { } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // for warmup does user set max duration, max iters, or both? // warmup is skipped by default, i.e. warmup_iter = 0 // warmup will be set to the non-zero value of max_warmup_duration diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index dbd178e0f8eee..b9f1e692e0f72 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -141,7 +141,11 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo size[i] = (int) t.size(i); } for (const auto i : c10::irange(dim, pad)) { +<<<<<<< HEAD size[i] = 1; +======= + size[i] = (int) 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } dim = std::max(dim, pad); cudnnTensorFormat_t filter_format{}; diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index f612436f56724..fe7b8a17342bd 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -2,8 +2,11 @@ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { @@ -22,10 +25,16 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kByte) { return CUDNN_DATA_UINT8; } +<<<<<<< HEAD TORCH_CHECK(false, "getCudnnDataTypeFromScalarType() not supported for ", toString(dtype) ); +======= + std::string msg("getCudnnDataTypeFromScalarType() not supported for "); + msg += toString(dtype); + throw std::runtime_error(msg); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { diff --git a/aten/src/ATen/detail/AcceleratorHooksInterface.h 
b/aten/src/ATen/detail/AcceleratorHooksInterface.h index fb9e51ded83e3..09494977ca578 100644 --- a/aten/src/ATen/detail/AcceleratorHooksInterface.h +++ b/aten/src/ATen/detail/AcceleratorHooksInterface.h @@ -12,7 +12,11 @@ namespace at { // AcceleratorHooksInterface is a shared interface provided by all // accelerators to allow generic code. +<<<<<<< HEAD // This interface is hook-based as it corresponds to all the functions +======= +// This inferface is hook-based as it corresponds to all the functions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // that are going to be called in a generic way from the CPU code. struct TORCH_API AcceleratorHooksInterface { diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f1f2056917472..6179d66909a2f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -118,6 +118,7 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +<<<<<<< HEAD virtual bool hasCKSDPA() const { return false; } @@ -126,6 +127,8 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } @@ -166,10 +169,13 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } +<<<<<<< HEAD virtual bool supportsBFloat16RNNWithCuDNN() const { return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual long versionCuDNN() const { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. 
", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HPUHooksInterface.h b/aten/src/ATen/detail/HPUHooksInterface.h index 3240ff4dac137..3649f4e39ea4a 100644 --- a/aten/src/ATen/detail/HPUHooksInterface.h +++ b/aten/src/ATen/detail/HPUHooksInterface.h @@ -25,7 +25,11 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface { false, "Cannot get device of pointer on HPU without HPU backend"); } +<<<<<<< HEAD bool isPinnedPtr(const void* /*data*/) const override { +======= + bool isPinnedPtr(const void*) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index d2e331abb0c04..f4bd44b96649a 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -21,10 +21,13 @@ bool isMTIAHooksBuilt() { } // namespace detail +<<<<<<< HEAD bool MTIAHooksInterface::isAvailable() const { return detail::isMTIAHooksBuilt() && detail::getMTIAHooks().deviceCount() > 0; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index b415862f29e7c..5e140a93a1190 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -149,8 +149,11 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); return; } +<<<<<<< HEAD virtual bool isAvailable() const override; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API MTIAHooksArgs {}; diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 1ab3e99e10773..78d169fa0e2e6 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h @@ -38,7 +38,11 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { Generator getNewGenerator( [[maybe_unused]] DeviceIndex device_index = -1) const override { +<<<<<<< HEAD // TODO(FFFrog): Preserved for BC and will be removed in the future. +======= + // TODO(FFFrog): Perserved for BC and will be removed in the future. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::GetGeneratorPrivate().has_value()) return at::GetGeneratorForPrivateuse1(device_index); diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index f1b3ae2b7760b..8082efaf83684 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -15,11 +15,19 @@ #define DLPACK_EXTERN_C #endif +<<<<<<< HEAD /*! \brief The current major version of dlpack */ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ #define DLPACK_MINOR_VERSION 1 +======= +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 80 + +/*! \brief The current ABI version of dlpack */ +#define DLPACK_ABI_VERSION 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! 
\brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -32,12 +40,19 @@ #define DLPACK_DLL #endif +<<<<<<< HEAD +#include +======= +// NOLINTNEXTLINE(modernize-deprecated-headers) #include +// NOLINTNEXTLINE(modernize-deprecated-headers) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #ifdef __cplusplus extern "C" { #endif +<<<<<<< HEAD /*! * \brief The DLPack version. @@ -65,6 +80,8 @@ typedef struct { uint32_t minor; } DLPackVersion; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! * \brief The device type in DLDevice. */ @@ -116,7 +133,11 @@ typedef enum { kDLWebGPU = 15, /*! \brief Qualcomm Hexagon DSP */ kDLHexagon = 16, +<<<<<<< HEAD /*! \brief Microsoft MAIA devices */ +======= + /*! \brief Microsoft AI Accelerator */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) kDLMAIA = 17, } DLDeviceType; @@ -157,6 +178,7 @@ typedef enum { kDLComplex = 5U, /*! \brief boolean */ kDLBool = 6U, +<<<<<<< HEAD /*! \brief FP8 data types */ kDLFloat8_e3m4 = 7U, kDLFloat8_e4m3 = 8U, @@ -177,6 +199,8 @@ typedef enum { * while the consumer must stop importing if the value is unexpected. */ kDLFloat4_e2m1fn = 17U, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DLDataTypeCode; /*! @@ -190,12 +214,15 @@ typedef enum { * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex: type_code = 5, bits = 64, lanes = 1 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) +<<<<<<< HEAD * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) * * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ typedef struct { /*! @@ -223,7 +250,11 @@ typedef struct { * `byte_offset` field should be used to point to the beginning of the data. * * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, +<<<<<<< HEAD * TVM, perhaps others) do not adhere to this 256 byte alignment requirement +======= + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. @@ -241,9 +272,12 @@ typedef struct { * return size; * } * \endcode +<<<<<<< HEAD * * Note that if the tensor is of size zero, then the data pointer should be * set to `NULL`. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ void* data; /*! \brief The device of the tensor */ @@ -253,12 +287,20 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! 
\brief The shape of the tensor */ +<<<<<<< HEAD int64_t* shape; +======= + const int64_t* shape; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ +<<<<<<< HEAD int64_t* strides; +======= + const int64_t* strides; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /*! \brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; @@ -269,6 +311,7 @@ typedef struct { * not meant to transfer the tensor. When the borrowing framework doesn't need * the tensor, it should call the deleter to notify the host that the resource * is no longer needed. +<<<<<<< HEAD * * \note This data structure is used as Legacy DLManagedTensor * in DLPack exchange and is deprecated after DLPack v0.8 @@ -276,6 +319,8 @@ typedef struct { * This data structure may get renamed or deleted in future versions. * * \sa DLManagedTensorVersioned +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ typedef struct DLManagedTensor { /*! \brief DLTensor which is being memory managed */ @@ -284,6 +329,7 @@ typedef struct DLManagedTensor { * which DLManagedTensor is used in the framework. It can also be NULL. */ void * manager_ctx; +<<<<<<< HEAD /*! * \brief Destructor - this should be called * to destruct the manager_ctx which backs the DLManagedTensor. It can be @@ -360,6 +406,15 @@ struct DLManagedTensorVersioned { DLTensor dl_tensor; }; +======= + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef __cplusplus } // DLPACK_EXTERN_C #endif diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index d58d436c511d1..5cb6684c332f2 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -158,7 +158,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE(kron); OP_DECOMPOSE(l1_loss); m.impl("layer_norm", native::layer_norm_symint); +<<<<<<< HEAD m.impl("_fused_rms_norm", native::rms_norm_composite); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OP_DECOMPOSE2(ldexp, Tensor); OP_DECOMPOSE2(less_equal, Tensor ); OP_DECOMPOSE2(less, Tensor ); diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 0d2f075d0c540..e916e9561c206 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -283,7 +283,11 @@ inline void boxed_existing_bdim_all_batch_rule( // Use when all tensors arguments accept one (normal) batch dim. // This batching rule expands the batch dim on all Tensors, reshapes it into // dim 0, calls the op, and then reshapes the batch dim out of dim 0. 
+<<<<<<< HEAD // This is not the most efficient thing; if there are alternatives, please try +======= +// This is not the most efficient thing; if there are alternatives, plese try +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // to use them. Use this only as a last resort. #define EXISTING_BDIM_ALL_BOXED(op) \ m.impl(#op, torch::CppFunction::makeFromBoxedFunction()); @@ -410,7 +414,11 @@ struct ExistingBdimBatchRuleHelper +<<<<<<< HEAD Tensor& unary_inplace_batch_rule(Tensor& self, std::optional /*unused*/, ExtraArgs... extra_args) { +======= +Tensor& unary_inplace_batch_rule(Tensor& self, std::optional, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) INVOKE(self, Method)(std::forward(extra_args)...); return self; } diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 804d6953bd410..4de3fa3d0ef8a 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -39,7 +39,11 @@ Tensor vdot_decomp(const Tensor& A, const Tensor& B) { // NB: I wrote this like this because we *might* want its for a future matmul // batch rule that isn't decomposed... // "tv" = tensor @ vector +<<<<<<< HEAD std::tuple> tv_batch_rule( +======= +static std::tuple> tv_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { if (self_bdim && other_bdim) { @@ -66,7 +70,11 @@ std::tuple> tv_batch_rule( TORCH_INTERNAL_ASSERT(false, "can't get here"); } +<<<<<<< HEAD std::tuple> mv_batch_rule( +======= +static std::tuple> mv_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -79,7 +87,11 @@ std::tuple> mv_batch_rule( return tv_batch_rule(self, self_bdim, other, other_bdim); } +<<<<<<< HEAD std::tuple> mm_batch_rule( +======= +static std::tuple> mm_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -94,7 +106,11 @@ std::tuple> mm_batch_rule( return std::make_tuple( at::matmul(self_, other_), 0 ); } +<<<<<<< HEAD std::tuple> bmm_batch_rule( +======= +static std::tuple> bmm_batch_rule( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -176,7 +192,11 @@ struct LinalgCheckMatrixUnaryRuleHelper; template struct LinalgCheckMatrixUnaryRuleHelper> { +<<<<<<< HEAD static Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { +======= + static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions."); return moveBatchDimToFront(tensor, batch_dim); } @@ -222,7 +242,11 @@ struct LinalgCheckMatrixBinaryRuleHelper; template struct LinalgCheckMatrixBinaryRuleHelper> { +<<<<<<< HEAD static std::tuple check_inputs_and_reshape_inputs( +======= + static inline std::tuple check_inputs_and_reshape_inputs( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& first, std::optional first_bdim, const Tensor& second, std::optional second_bdim) { TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2, @@ -250,7 +274,11 @@ struct LinalgCheckMatrixBinaryRuleHelper> } }; +<<<<<<< HEAD void expect_at_least_rank( +======= +static void expect_at_least_rank( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& tensor, std::optional tensor_bdim, int64_t expected_rank, @@ -384,7 +412,11 @@ fourOutputs solve_ex_batch_rule( // NOTE [ solve_ex Batch Rule Contiguity ] // A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on +<<<<<<< HEAD // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior +======= + // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // differs based on whether or not the optimized path was taken const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous(); if (batched_A_was_contiguous && !A.is_complex()) { @@ -472,7 +504,11 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } +<<<<<<< HEAD std::tuple> +======= +static std::tuple> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pinv_batch_rule( const Tensor& input, std::optional input_bdim, const std::optional& atol, const std::optional atol_bdim, const std::optional& rtol, diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 5fba8d257ceb8..6cf7194037f38 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -45,6 +48,7 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); +<<<<<<< HEAD { // getStepTensor returns a regular Tensor. If indices_ is a DTensor // we want to allow this mixed DTensor-Tensor operation. 
@@ -52,6 +56,10 @@ static std::tuple> embedding_batch_rule( const auto range = getStepTensor(indices, batch_size, num_embeddings); indices_ = indices_ + range; } +======= + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } @@ -213,6 +221,7 @@ static cudnn_grid_sample_backward_batch_rule( return grid_sample_backward_helper_out(std::move(bw_out), 0, 0, bdim_size); } +<<<<<<< HEAD // uses functional formulation for one_hot under vmap to be compatible with // fakeTensor/dynamic shapes and compiled functorch transforms. // mirrors the meta path in aten/src/ATen/native/Onehot.cpp, @@ -229,6 +238,42 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes const auto options = self.options(); at::Tensor index = at::arange(num_classes, options); return at::eq(self.unsqueeze(-1), index).to(at::kLong); +======= +// TODO: replace with targetable functionalization +static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { + TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); + auto shape = self.sym_sizes().vec(); + + // empty tensor could be converted to one hot representation, + // but shape inference is not possible. + if (self.sym_numel() == 0) { + if (num_classes <= 0) { + TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); + } else { + shape.emplace_back(num_classes); + return at::empty_symint(shape, self.options()); + } + } + + TORCH_CHECK(num_classes > 0, "When vmap-ing torch.nn.functional.one_hot, please " + "provide an explicit positive num_classes argument."); + + // Disabling all of the following checks. This is OK because scatter has checks too. + // Maybe one_hot should be a primitive wrt autograd so we don't have to deal with this. + // // non-empty tensor + // if (self.device().type() != at::kCUDA) { + // //for cuda, rely on device assert thrown by scatter + // TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); + // } + // if (self.device().type() != at::kCUDA) { + // //rely on device asserts from scatter to avoid sync here + // TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); + // } + + shape.emplace_back(num_classes); + Tensor ret = at::zeros_symint(shape, self.options()); + return ret.scatter(-1, self.unsqueeze(-1), 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index de1a37a9b4320..f6854d5321285 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -282,7 +282,11 @@ static std::tuple> _softmax_backward_batch_rule( dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim); +<<<<<<< HEAD // Not sure why output_ needs to be marked as .contiguous(). Something must +======= + // Not sure why output_ needs to be marked as .contiguous(). 
Someting must +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // have changed in PyTorch (and output of softmax is probably always contiguous) return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0); } diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index f5c770371de8e..cefbc94a80204 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -12,14 +12,21 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTBEGIN(bugprone-unchecked-optional-access) namespace at::functorch { namespace { +<<<<<<< HEAD bool any_has_value(ArrayRef> bdims) { +======= +static bool any_has_value(ArrayRef> bdims) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& bdim : bdims) { if (bdim.has_value()) { return true; @@ -28,7 +35,11 @@ bool any_has_value(ArrayRef> bdims) { return false; } +<<<<<<< HEAD int64_t get_num_leading_nones(ArrayRef> indices) { +======= +static int64_t get_num_leading_nones(ArrayRef> indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t result = 0; for (const auto& idx : indices) { if (!idx.has_value() || !idx->defined()) { @@ -40,7 +51,11 @@ int64_t get_num_leading_nones(ArrayRef> indices) { return result; } +<<<<<<< HEAD int64_t get_max_index_logical_dim( +======= +static int64_t get_max_index_logical_dim( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef> indices, ArrayRef> indices_bdims) { int64_t max_logical_dim = -1; @@ -57,7 +72,11 @@ int64_t get_max_index_logical_dim( return max_logical_dim; } +<<<<<<< HEAD std::vector> batchIndices( +======= +static std::vector> batchIndices( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::TensorOptions options, ArrayRef> indices, ArrayRef> indices_bdims, @@ -95,10 +114,16 @@ std::vector> batchIndices( if (index.has_value() && index->sym_numel() != 0) { const auto idx_bdim = indices_bdims[i]; indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank)); +<<<<<<< HEAD TORCH_CHECK( !(index.value().dtype() == kBool) || !indices_bdims[i].has_value(), "vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask." ); +======= + if (index.value().dtype() == kBool && indices_bdims[i].has_value()) { + throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask."); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { indices_.push_back(index); } @@ -126,7 +151,11 @@ std::vector> batchIndices( // Define an "advanced index" to be a selection object that is // a non-trivial Tensor (i.e. it does not represent :). 
+<<<<<<< HEAD bool is_advanced_index(const std::optional& idx) { +======= +static bool is_advanced_index(const std::optional& idx) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!idx.has_value()) { return false; } @@ -137,7 +166,11 @@ bool is_advanced_index(const std::optional& idx) { } // See NOTE: [advanced indices adjacent] for definition +<<<<<<< HEAD bool are_advanced_indices_adjacent(ArrayRef> indices) { +======= +static bool are_advanced_indices_adjacent(ArrayRef> indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t num_advanced_indices_regions = 0; bool in_advanced_indices_region = false; for (const auto& idx : indices) { @@ -165,7 +198,11 @@ bool are_advanced_indices_adjacent(ArrayRef> indices) { // - result: Tensor[B, 4, 5, 6, 2, 3, 7, 8] // ------- ---- // region2 region1 +<<<<<<< HEAD Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +======= +static Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VmapDimVector permutation(tensor.dim(), 0); std::iota(permutation.begin(), permutation.end(), 0); std::rotate( @@ -553,7 +590,11 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List return self; } +<<<<<<< HEAD Tensor maybe_permute_values( +======= +static Tensor maybe_permute_values( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& values, ArrayRef> orig_indices, ArrayRef> orig_indices_bdims) { @@ -1052,7 +1093,11 @@ std::tuple> index_add_batch_rule( other, other_bdim, alpha, false); } +<<<<<<< HEAD std::tuple binary_pointwise_align( +======= +static std::tuple binary_pointwise_align( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor & self, std::optional self_bdim, const Tensor & mask, diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index 48a735c3e5332..88c1fc755aae4 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,8 +171,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); +<<<<<<< HEAD // This is special because this op doesn't return anything m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 08db1d202b4eb..94a2629586c1b 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -346,7 +346,11 @@ std::tuple> slice_batch_rule( return std::make_tuple(std::move(result), 0); } +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return dim == 0 || dim 
== -1; } diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 92123c1cd0e22..664afede8aa9b 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -224,7 +224,11 @@ static Tensor safeStack(TensorList tensors) { // is possible for the backward function to return an undefined grad for some // grad_input for each example. In that case, we return an undefined grad. // +<<<<<<< HEAD // It is theoretically possible for *some* of the examples to produce an +======= + // It is theoretically posssible for *some* of the examples to produce an +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // undefined grad (a kernel could peek at the gradient values and return an // undefined tensor if it determines the gradient is full of zeros). We // could handle this by treating the undefined grad as a zero-filled tensor diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.cpp b/aten/src/ATen/functorch/BatchedTensorImpl.cpp index 895770fc69921..5969881a1ebac 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.cpp +++ b/aten/src/ATen/functorch/BatchedTensorImpl.cpp @@ -113,7 +113,11 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const { return sym_sizes_default(); } +<<<<<<< HEAD // The following are publicly exposed as methods of Tensor +======= +// The following are publically exposed as methods of Tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef BatchedTensorImpl::strides_custom() const { return strides_default(); @@ -126,7 +130,11 @@ SymIntArrayRef BatchedTensorImpl::sym_strides_custom() const { // TODO: implement proper contiguity on batched tensor, then put // sizes_strides_policy back to Default +<<<<<<< HEAD c10::SymBool BatchedTensorImpl::sym_is_contiguous_custom(at::MemoryFormat memory_format) const { +======= +bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", "other than torch.contiguous_format"); diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 985b289b3fe02..c910c73210d01 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -69,7 +69,11 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error messages. 
+<<<<<<< HEAD c10::SymBool sym_is_contiguous_custom(at::MemoryFormat memory_format) const override; +======= + bool is_contiguous_custom(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; c10::intrusive_ptr shallow_copy_and_detach( @@ -160,10 +164,13 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, +<<<<<<< HEAD DispatchKey::SparseCPU, DispatchKey::SparseCUDA, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 69af08a7bd7ce..a84f1b2c4113e 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -465,11 +465,19 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s // used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh) static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) { +<<<<<<< HEAD dynamicLayerBack(op, stack, true); } static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { dynamicLayerBack(op, stack, false); +======= + return dynamicLayerBack(op, stack, true); +} + +static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + return dynamicLayerBack(op, stack, false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) { diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h index 672a33fda0016..4d5e82a310cd5 100644 --- a/aten/src/ATen/functorch/DynamicLayer.h +++ b/aten/src/ATen/functorch/DynamicLayer.h @@ -37,7 +37,11 @@ namespace at::functorch { // how to perform the transform. // // TODO: we can excise DynamicLayer in favor of Interpreter, +<<<<<<< HEAD // But I am going to leave it for now as a compatibility shim to avoid +======= +// But I am going to leave it for now as a compatiblity shim to avoid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // needing to refactor a lot of callsites... 
struct TORCH_API DynamicLayer { explicit DynamicLayer( diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 3d3b2069387d7..2879f3013e2b5 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -3,7 +3,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -88,7 +91,11 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t); // >>> VmapInterpreterPtr(&interpreter).batchSize() // // Finally, Interpreter::process switches on the type of the interpreter +<<<<<<< HEAD // and calls one of {Transform}Interpreter::processImpl under the hood. +======= +// and calls one of {Transform}Intepreter::processImpl under the hood. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Same for Interpreter::sendToNextInterpreter :) struct VmapInterpreterMeta { @@ -107,10 +114,16 @@ struct VmapInterpreterMeta { template friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { +<<<<<<< HEAD TORCH_CHECK( !json_t.batchSize_.is_heap_allocated(), "Serialization for heap-allocated SymInt is not implemented yet" ); +======= + if (json_t.batchSize_.is_heap_allocated()) { + throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); json_j["randomness"] = static_cast(json_t.randomness_); } @@ -304,7 +317,11 @@ struct Interpreter { } else if (meta.contains("Functionalize")) { json_t.meta_.emplace(meta["Functionalize"].template get()); } else { +<<<<<<< HEAD TORCH_CHECK(false, "unknown interpreter metadata type"); +======= + throw std::runtime_error("unknown interpreter metadata type"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 0c2ed37d23765..6816f5740a6e8 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -68,18 +68,30 @@ namespace at::functorch { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. +<<<<<<< HEAD bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } int64_t get_current_level() { +======= +static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { + return dim == 0 || dim == -1; +} + +static int64_t get_current_level() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto maybe_level = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_level.has_value()); return maybe_level->layerId(); } // This check should probably go into the dispatcher... 
+<<<<<<< HEAD bool participatesInCurrentLevel(const Tensor& self) { +======= +static bool participatesInCurrentLevel(const Tensor& self) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto current_level = get_current_level(); auto* maybe_batched_impl = maybeGetBatchedImpl(self); if (!maybe_batched_impl) { @@ -90,7 +102,11 @@ bool participatesInCurrentLevel(const Tensor& self) { return self_level == current_level; } +<<<<<<< HEAD bool participatesInCurrentLevel(ITensorListRef self) { +======= +static bool participatesInCurrentLevel(ITensorListRef self) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const Tensor& tensor : self) { if (participatesInCurrentLevel(tensor)) { return true; @@ -285,7 +301,11 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). +<<<<<<< HEAD std::optional maximum_indexable_location( +======= +static std::optional maximum_indexable_location( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -298,7 +318,11 @@ std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
+<<<<<<< HEAD void checkBasicAsStridedValidForSlice( +======= +static void checkBasicAsStridedValidForSlice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& physical_tensor, int64_t num_batch_dims, c10::SymIntArrayRef sizes, @@ -733,7 +757,11 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) { } TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { +<<<<<<< HEAD // still legacy b/c returns multiple tensors +======= + // still legacy b/c teturns multiple tensors +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) m.impl("split.Tensor", split_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule); diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 667e92970033c..18d6ef008019e 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -6,7 +6,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -71,7 +74,11 @@ Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optiona return output; } +<<<<<<< HEAD inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +======= +static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -109,7 +116,13 @@ Tensor binary_cross_entropy_with_logits_hack( } Tensor trace_backward_decomp(const Tensor& grad, IntArrayRef sizes) { +<<<<<<< HEAD TORCH_CHECK(sizes.size() == 2, "expected matrix input"); +======= + if (sizes.size() != 2) { + throw std::runtime_error("expected matrix input"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); // Workaround using index_put instead of yet unsupported index_fill_ @@ -127,7 +140,11 @@ namespace { template using Ctype = std::conditional_t; +<<<<<<< HEAD Tensor make_feature_noise(const Tensor& input) { +======= +static Tensor make_feature_noise(const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input_sizes = input.sizes(); TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; @@ -141,7 +158,11 @@ Tensor make_feature_noise(const Tensor& input) { return at::empty(sizes, input.options()); } +<<<<<<< HEAD bool is_fused_kernel_acceptable(const Tensor& input, double p) { +======= +static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && 
p < 1 && input.numel() > 0; } @@ -210,7 +231,11 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) +<<<<<<< HEAD Tensor dropout(const Tensor& input, double p, bool train) { +======= +static Tensor dropout(const Tensor& input, double p, bool train) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = [&]() { NoNamesGuard guard; if (train && is_fused_kernel_acceptable(input, p)) { diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index cfdecaac778b3..b72da1fda9b6f 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -37,6 +37,7 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->copy_data(dest, src, count); } +<<<<<<< HEAD // From DeviceAllocator bool initialized() override { @@ -64,6 +65,8 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->resetPeakStats(device); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // From CUDAAllocator void* raw_alloc(size_t nbytes) override { @@ -82,6 +85,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->init(device_count); } +<<<<<<< HEAD +======= + bool initialized() override { + return allocator_->initialized(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double getMemoryFraction(c10::DeviceIndex device) override { return allocator_->getMemoryFraction(device); } @@ -90,8 +100,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } +<<<<<<< HEAD std::vector getExpandableSegmentSizes(c10::DeviceIndex device) override { return allocator_->getExpandableSegmentSizes(device); +======= + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void enable(bool value) override { @@ -114,6 +129,21 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->recordStream(ptr, stream); } +<<<<<<< HEAD +======= + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { return allocator_->snapshot(mempool_id); } diff --git a/aten/src/ATen/metal/Context.h b/aten/src/ATen/metal/Context.h index e4c6da738e0db..b5379c9b257a4 100644 --- a/aten/src/ATen/metal/Context.h +++ b/aten/src/ATen/metal/Context.h @@ -18,7 +18,11 @@ extern std::atomic g_metal_impl_registry; class MetalImplRegistrar { public: +<<<<<<< HEAD explicit 
MetalImplRegistrar(MetalInterface* /*impl*/); +======= + explicit MetalImplRegistrar(MetalInterface*); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 6c58de099648d..e8c178b18f00d 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,11 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ +<<<<<<< HEAD "The MPS backend is supported on MacOS 14.0+. ", \ +======= + "The MPS backend is supported on MacOS 13.0+.", \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." @@ -43,6 +47,10 @@ TensorBase empty_mps( int64_t nelements = c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); +<<<<<<< HEAD +======= + TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto dtype_meta = scalarTypeToTypeMeta(dtype); diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 9b58477104978..9075e733eed99 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -18,7 +18,15 @@ namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version enum class MacOSVersion : uint32_t { +<<<<<<< HEAD MACOS_VER_14_4_PLUS = 0, +======= + MACOS_VER_13_1_PLUS = 0, + MACOS_VER_13_2_PLUS, + MACOS_VER_13_3_PLUS, + MACOS_VER_14_0_PLUS, + MACOS_VER_14_4_PLUS, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_VER_15_0_PLUS, MACOS_VER_15_1_PLUS, MACOS_VER_15_2_PLUS, @@ -55,6 +63,7 @@ class TORCH_API MPSDevice { */ bool isMacOS13Plus(MacOSVersion version) const; +<<<<<<< HEAD /** * Returns device name */ @@ -66,6 +75,8 @@ class TORCH_API MPSDevice { */ unsigned getCoreCount() const; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ~MPSDevice(); private: diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index 5a37490c02402..39ddd7c1ceec0 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -32,11 +32,19 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de MPSDevice::MPSDevice() : _mtl_device(nil) { // Check that MacOS 13.0+ version of MPS framework is available +<<<<<<< HEAD // Create the MPSGraph and check method introduced in 14.0 // which is used by MPS backend. id mpsCD = NSClassFromString(@"MPSGraph"); if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) { +======= + // Create the MPSGraph and check method introduced in 13.0 + // which is used by MPS backend. 
+ id mpsCD = NSClassFromString(@"MPSGraph"); + + if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } @@ -66,12 +74,30 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}]; } }; +<<<<<<< HEAD +======= + static bool _macos_13_1_plus = is_os_version_at_least(13, 1); + static bool _macos_13_2_plus = is_os_version_at_least(13, 2); + static bool _macos_13_3_plus = is_os_version_at_least(13, 3); + static bool _macos_14_0_plus = is_os_version_at_least(14, 0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static bool _macos_14_4_plus = is_os_version_at_least(14, 4); static bool _macos_15_0_plus = is_os_version_at_least(15, 0); static bool _macos_15_1_plus = is_os_version_at_least(15, 1); static bool _macos_15_2_plus = is_os_version_at_least(15, 2); switch (version) { +<<<<<<< HEAD +======= + case MacOSVersion::MACOS_VER_13_1_PLUS: + return _macos_13_1_plus; + case MacOSVersion::MACOS_VER_13_2_PLUS: + return _macos_13_2_plus; + case MacOSVersion::MACOS_VER_13_3_PLUS: + return _macos_13_3_plus; + case MacOSVersion::MACOS_VER_14_0_PLUS: + return _macos_14_0_plus; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case MacOSVersion::MACOS_VER_14_4_PLUS: return _macos_14_4_plus; case MacOSVersion::MACOS_VER_15_0_PLUS: @@ -85,6 +111,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de } } +<<<<<<< HEAD std::string MPSDevice::getName() const { @autoreleasepool { return [[_mtl_device name] UTF8String]; @@ -115,6 +142,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de at::Allocator* GetMPSAllocator(bool useSharedAllocator) { return getIMPSAllocator(useSharedAllocator); } +======= +at::Allocator* GetMPSAllocator(bool useSharedAllocator) { + return getIMPSAllocator(useSharedAllocator); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool is_available() { return MPSDevice::getInstance()->device() != nil; } diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index 34fbd31af91da..812a09750c225 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -34,7 +34,11 @@ case 14: switch (minor) { case 0: +<<<<<<< HEAD return true; +======= + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case 4: return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); default: @@ -42,7 +46,23 @@ return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); } case 13: +<<<<<<< HEAD return true; +======= + switch (minor) { + case 0: + return true; + case 1: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS); + case 2: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); + case 3: + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); + default: + TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+"); + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); + } +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: TORCH_WARN("Checking for unexpected MacOS ", major, ".", minor, " returning false"); return false; @@ -70,10 +90,14 @@ } void* MPSHooks::getCommandBuffer() const { +<<<<<<< HEAD auto stream = at::mps::getDefaultMPSStream(); // Release pending computeCommandEncoder, as extensions is likely to allocate new one stream->endKernelCoalescing(); return stream->commandBuffer(); +======= + return at::mps::getDefaultMPSStream()->commandBuffer(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index 595d71aeef15a..c53089c67d5f5 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,6 +158,7 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; +<<<<<<< HEAD // For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26 // See https://github.com/pytorch/pytorch/issues/163962 // Workaround by batching copy commands into 4Gb chunks @@ -170,6 +171,9 @@ @interface MPSGraphExecutionDescriptor () bytes_filled += bytes_to_copy; bytes_remains -= bytes_to_copy; } +======= + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index c164120a1f3c4..b6c3e0e76c983 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,13 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { +<<<<<<< HEAD static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; +======= +static const double SELU_ALPHA = 1.6732632423543772848170429916717; +static const double SELU_SCALE = 1.0507009873554804934193349852946; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); @@ -670,8 +675,11 @@ Tensor rrelu_with_noise_backward( } Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { +<<<<<<< HEAD TORCH_CHECK(std::isfinite(lower.to()), "rrelu: lower bound must be finite, got ", lower.to()); TORCH_CHECK(std::isfinite(upper.to()), "rrelu: upper bound must be finite, got ", upper.to()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator)); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 5821cd561cdf1..1e0b36a735f2a 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -24,7 +24,11 @@ namespace at::native { namespace { template +<<<<<<< 
HEAD void adaptive_avg_pool3d_out_frame( +======= +static void adaptive_avg_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, @@ -176,7 +180,11 @@ void adaptive_avg_pool3d_out_cpu_template( } template +<<<<<<< HEAD void adaptive_avg_pool3d_backward_out_frame( +======= +static void adaptive_avg_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput_p, const scalar_t* gradOutput_p, int64_t sizeD, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index ef4bab3ec1de0..436ac6d66b2ca 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -93,7 +93,11 @@ namespace { // 5d tensor B x D x T x H x W template +<<<<<<< HEAD void adaptive_max_pool3d_single_out_frame( +======= +static void adaptive_max_pool3d_single_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, @@ -170,7 +174,11 @@ void adaptive_max_pool3d_single_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_out_frame( +======= +static void adaptive_max_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, @@ -202,7 +210,11 @@ void adaptive_max_pool3d_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_backward_single_out_frame( +======= +static void adaptive_max_pool3d_backward_single_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_p, const scalar_t *gradOutput_p, const int64_t *ind_p, @@ -241,7 +253,11 @@ void adaptive_max_pool3d_backward_single_out_frame( } template +<<<<<<< HEAD void adaptive_max_pool3d_backward_out_frame( +======= +static void adaptive_max_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_data, const scalar_t *gradOutput_data, const int64_t *indices_data, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 365cfa311512a..8a74920373edf 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -153,7 +153,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void avg_pool3d_out_frame( +======= +static void avg_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *input_p, scalar_t *output_p, int64_t nslices, @@ -333,7 +337,11 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( namespace { template +<<<<<<< HEAD void avg_pool3d_backward_out_frame( +======= +static void avg_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *gradInput_p, const scalar_t *gradOutput_p, int64_t nslices, diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp 
index 6669357cda456..941ecc32399f9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -2060,7 +2060,11 @@ std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { } // TODO Deprecate this function in favour of linalg_lu_factor_ex +<<<<<<< HEAD std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) { +======= +std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ", "removed in a future PyTorch release.\n", @@ -2453,7 +2457,11 @@ TORCH_IMPL_FUNC(linalg_qr_out)(const Tensor& A, // geqrf requires m x n workspace input that is modified in-place // We try to use Q. If it doesn't fit, we try to use R +<<<<<<< HEAD // If m > n and compute_q==false, it won't fit into Q or R, so we need to create an auxiliary tensor +======= + // If m > n and compute_q==false, it won't fit into Q or R, so we neet to create an auxiliary tensor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor QR; if (compute_q && Q.size(-1) == n) { QR = Q; @@ -4095,7 +4103,11 @@ Tensor linalg_vander_symint( const auto n = N.value_or(shape.back()); TORCH_CHECK(n > 1, "N must be greater than 1."); +<<<<<<< HEAD // Append cumprod of the other 0...n-1 powers +======= + // Append cumprod of the oher 0...n-1 powers +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shape.push_back(n - 1); auto result = at::cumprod(x_.unsqueeze(-1).expand_symint(shape), -1); // The row of ones diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index df64aa42e602f..acc4cfe4044d8 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -143,13 +143,21 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 */ template +<<<<<<< HEAD inline +======= +static inline +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::enable_if_t, int> lapack_work_to_int(const T val) { const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); return std::max(1, std::ceil(next_after)); } template +<<<<<<< HEAD inline +======= +static inline +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::enable_if_t::value, int> lapack_work_to_int(const T val) { return lapack_work_to_int(val.real()); } @@ -343,7 +351,11 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c For further details, please see the LAPACK documentation for GEQRF. 
*/ template +<<<<<<< HEAD void apply_geqrf(const Tensor& input, const Tensor& tau) { +======= +static void apply_geqrf(const Tensor& input, const Tensor& tau) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, @@ -1039,7 +1051,11 @@ void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tr } template +<<<<<<< HEAD void apply_svd(const Tensor& A, +======= +static void apply_svd(const Tensor& A, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const bool full_matrices, const bool compute_uv, const Tensor& U, diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 6b7496f49732e..4fdd239f18294 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,7 +9,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(__s390x__) && !defined(__powerpc__) #include #endif @@ -58,7 +61,11 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, template scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy); +<<<<<<< HEAD static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) { +======= +static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return n == 1 || lda >= std::max(1L, m); } @@ -333,6 +340,7 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +<<<<<<< HEAD // TODO(vasiliy, future PR): figure out why we need to declare this function, when // other functions that live in ATen/native/*.cpp without declarations // or headers work just fine. 
@@ -352,4 +360,6 @@ std::optional out_dtype) { return out; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index b476ca3cff8f1..69e36ebb62265 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,11 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { +<<<<<<< HEAD auto constexpr intmax = std::numeric_limits::max(); +======= + auto intmax = std::numeric_limits::max(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return n <= intmax && incx <= intmax; } @@ -315,7 +319,11 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { +<<<<<<< HEAD auto constexpr intmax = std::numeric_limits::max(); +======= + auto intmax = std::numeric_limits::max(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } @@ -375,7 +383,11 @@ static void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy) { +<<<<<<< HEAD bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); +======= + return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index bd19f9c987f14..e7a038e37ffb6 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -70,7 +70,11 @@ inline void searchsorted_maybe_trim_input_tensors( const Tensor& raw_boundaries) { Tensor trimmed_sorter; Tensor raw_sorter; +<<<<<<< HEAD searchsorted_maybe_trim_input_tensors( +======= + return searchsorted_maybe_trim_input_tensors( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) trimmed_input, trimmed_boundaries, trimmed_sorter, diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index c17a70ea308ab..b3852d40f6fb0 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -51,7 +51,11 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int * // brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C #if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5) #define ONEDNN_UKERNEL_1 +<<<<<<< HEAD #elif ((IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR >= 6) || (IDEEP_VERSION_MAJOR > 3)) +======= +#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONEDNN_UKERNEL_2 #endif #if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))) @@ -202,7 +206,11 @@ void gemm( float *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, 
&ldc); #if AT_MKLDNN_ENABLED() +<<<<<<< HEAD if (mkldnn_reduced_f32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { +======= + if (mkldnn_bf32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return; } #endif @@ -358,6 +366,7 @@ void gemm( int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; char transa_ = to_blas(transa), transb_ = to_blas(transb); float alpha_ = alpha, beta_ = beta; +<<<<<<< HEAD int c_size = n_ * m_; // C matrix in OpenBLAS sbgemm are of type "float" so we have to convert, copy and copy back. std::vector float_v(c_size, 0.0f); @@ -366,17 +375,28 @@ void gemm( float_v[j * m_ + i] = c10::convert(c[j * ldc_ + i]); } } +======= + int c_size = n_ * ldc_; + // C matrix in OpenBLAS sbgemm are of type "float" so we have to convert, copy and copy back. + std::vector float_v(c, c + c_size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sbgemm_(&transa_, &transb_, &m_, &n_, &k_, &alpha_, a, &lda_, b, &ldb_, &beta_, +<<<<<<< HEAD float_v.data(), &m_); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { c[j * ldc_ + i] = c10::convert(float_v[j * m_ + i]); } +======= + float_v.data(), &ldc_); + for (auto cv: float_v) { + *(c++) = c10::convert(cv); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return; } @@ -457,9 +477,30 @@ void gemm( return; } #endif +<<<<<<< HEAD gemm_no_downcast_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +======= + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. + int64_t c_size = n * m; + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( + at::kCPU, at::kBFloat16, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = float_c[j * m + i]; + } else { + c[offset] = beta * c[offset] + float_c[j * m + i]; + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void gemm( @@ -478,9 +519,30 @@ void gemm( return; } #endif +<<<<<<< HEAD gemm_no_downcast_stub( at::kCPU, at::kHalf, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +======= + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. 
+ int64_t c_size = n * m; + std::vector float16_c(c_size, 0.f); + gemm_stub( + at::kCPU, at::kHalf, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = c10::convert(float16_c[j * m + i]); + } else { + c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + } + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void gemm( @@ -991,7 +1053,11 @@ std::size_t UnsafeUkernelKeyHasher::operator()(const PackKey& key) cons template struct KernelCache { using kstore_t = std::unordered_map, UnsafeUkernelKeyHasher>; +<<<<<<< HEAD static std::shared_ptr&& fetch_or_create( +======= + static inline std::shared_ptr&& fetch_or_create( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const key_t& key, const std::function()>& callback) { auto&& search = get_store().find(key); @@ -1003,7 +1069,11 @@ struct KernelCache { } } +<<<<<<< HEAD static kstore_t& get_store() { +======= + static inline kstore_t& get_store() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static thread_local kstore_t cache_kernels; return cache_kernels; } @@ -1067,7 +1137,11 @@ struct GemmHelper { struct Brgemm : public KernelCache { // Fetch/create GemmHelper object and execute brgemm with batch size = 1 template +<<<<<<< HEAD static void call( +======= + static inline void call( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t M, int64_t N, int64_t K, @@ -1118,12 +1192,20 @@ struct Brgemm : public KernelCache { .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data()); } +<<<<<<< HEAD static std::shared_ptr& get_current() { +======= + static inline std::shared_ptr& get_current() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static thread_local std::shared_ptr current; return current; } +<<<<<<< HEAD static bool device_check(ScalarType dtype) { +======= + static inline bool device_check(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!at::globalContext().userEnabledMkldnn()) { return false; } @@ -1153,7 +1235,11 @@ using pack_t = dnnl::ukernel::brgemm_pack_B; using pack_t = dnnl::ukernel::transform; #endif struct Pack : public KernelCache { +<<<<<<< HEAD static void call( +======= + static inline void call( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t K, int64_t N, int64_t ld_in, @@ -1182,7 +1268,11 @@ struct Pack : public KernelCache { } } +<<<<<<< HEAD static bool could_pack(ScalarType dtype) { +======= + static inline bool could_pack(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!at::globalContext().userEnabledMkldnn()) { return false; } diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 8b75f12ebaf21..8512af333fb8e 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ 
b/aten/src/ATen/native/CPUBlas.h @@ -206,6 +206,7 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *x, int64_t incx, c10::complex int32 #define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_API void brgemm( int64_t M, int64_t N, diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index d043014b3820e..b2ec8db631da7 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -81,7 +81,11 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // TODO: contiguous can be made to preserve the memory format // of the input. However since the above reshape clobbers h and w // it may not be safe to do that, since channels_last contiguous +<<<<<<< HEAD // may think oc and the last dim correspond to h,w? +======= + // may think oc and and the last dim correspond to h,w? +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // It is not clear, however from initial looking around it feels that // this may not be correct. // In this case channels last will likely require custom implementation diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index f0270a02b2677..f496f15512a3c 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -71,7 +71,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void col2im_out_cpu_template( +======= +static void col2im_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 13bef0a00b9c9..3c7134cd18617 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -24,6 +24,7 @@ static void _assert_match(const O& original, const C& compared, const std::strin } } +<<<<<<< HEAD template<> void _assert_match>( const c10::Device& original, @@ -47,6 +48,8 @@ void _assert_match>( } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional dtype, std::optional device, std::optional layout) { _assert_match(tensor.sym_sizes(), sizes, "sizes"); _assert_match(tensor.sym_strides(), strides, "strides"); diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 892144ac663a6..c09bea137751c 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -465,11 +465,16 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor return false; } +<<<<<<< HEAD auto is_channel_last = [](const at::Tensor& t) { auto fmt = t.suggest_memory_format(); return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; }; return is_channel_last(input) || is_channel_last(weight); +======= + auto fmt = input.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 610f454be21fa..db1e7f66fa865 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -3,7 +3,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -14,7 +17,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -32,6 +38,13 @@ #include #endif +<<<<<<< HEAD +======= +#ifdef USE_MPS +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -297,6 +310,7 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { +<<<<<<< HEAD return std::any_of( stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } @@ -341,6 +355,69 @@ struct ConvParams { bool is_stride_nonpos() const { return std::any_of( stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); +======= + bool is_strided = false; + for (const auto& s : stride) { + is_strided |= (s != 1); + } + return is_strided; + } + + bool is_dilated() const { + bool is_dilated = false; + for (const auto& d : dilation) { + is_dilated |= (d != 1); + } + return is_dilated; + } + + bool is_padded() const { + bool is_padded = false; + for (auto p : padding) { + is_padded |= (p != 0); + } + return is_padded; + } + + bool is_output_padding_neg() const { + bool is_non_neg = false; + for (const auto& p : output_padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_output_padding_big() const { + bool is_big = false; + for (auto i: c10::irange(output_padding.size())) { + is_big |= (output_padding[i] >= stride[i]); + } + return is_big; + } + + bool is_padding_neg() const { + bool is_non_neg = false; + for (const auto& p : padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_dilation_neg() const { + bool is_non_neg = false; + for (const auto& p : dilation) { + is_non_neg |= (p < 0); + } + return is_non_neg; + } + + bool is_stride_nonpos() const { + bool is_nonpos = false; + for (const auto& s : stride) { + is_nonpos |= (s <= 0); + } + return is_nonpos; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void view1d_as_2d() { @@ -406,6 +483,7 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) +<<<<<<< HEAD if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } @@ -423,6 +501,13 @@ struct ConvParams { } } if (needs_64bit_indexing_no_split(input, weight)) { +======= + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } + if (needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if 
(!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -430,6 +515,12 @@ struct ConvParams { return false; } } +<<<<<<< HEAD +======= + if (!input.is_cuda() || !cudnn_enabled) { + return false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -448,6 +539,7 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { +<<<<<<< HEAD if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { return false; } @@ -461,6 +553,18 @@ struct ConvParams { } } +======= + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { + // always use cudnn_depthwise for channels_last format + return true; + } + // native kernel doesn't support 64-bit non-splittable case + if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." 
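Both sides of the ConvParams conflict above gate the cuDNN path on a lazily cached version query plus a warn-once message. As an illustrative aside (not part of the diff), the minimal C++ sketch below shows that pattern in isolation; queryCudnnVersion() is a hypothetical stub standing in for detail::getCUDAHooks().versionCuDNN(), and a static flag with fprintf stands in for TORCH_WARN_ONCE.

#include <cstdint>
#include <cstdio>

// Hypothetical stub for the real cuDNN version lookup.
static int64_t queryCudnnVersion() { return 90100; }

// Gate a feature on the cached version, warning only once, mirroring the
// function-local-static + warn-once pattern in the hunk above.
static bool largeConvSupported() {
  static const int64_t cudnn_version = queryCudnnVersion();  // queried once per process
  if (cudnn_version < 90300) {
    static bool warned = false;
    if (!warned) {
      std::fprintf(stderr,
          "cuDNN cannot be used for large non-batch-splittable convolutions "
          "if the V8 API is not enabled or before cuDNN version 9.3+.\n");
      warned = true;
    }
    return false;
  }
  return true;
}

int main() {
  std::printf("supported: %d\n", largeConvSupported());
  return 0;
}

The function-local static means the version is queried at most once per process, which is why both branches of the conflict hoist the lookup out of the per-call path.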
@@ -470,10 +574,13 @@ struct ConvParams { return true; } } +<<<<<<< HEAD if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { // always use cudnn_depthwise for channels_last format return true; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -658,7 +765,10 @@ static void check_shape_forward(const at::Tensor& input, TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported"); TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero"); +<<<<<<< HEAD TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, @@ -689,10 +799,13 @@ static void check_shape_forward(const at::Tensor& input, ", but got bias of size ", at::symint::sizes(bias), " instead"); for (const auto i : c10::irange(2, k)) { +<<<<<<< HEAD // T could be int64_t or SymInt, Specialized numeric_limts in c10/core/SymInt.h TORCH_CHECK(padding[i-2] <= (std::numeric_limits::max() - padding[i-2]), "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ", (std::numeric_limits::max() / 2)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_shape.push_back(at::symint::size(input, i) + 2 * padding[i-2]); // log new kernel size considering dilation kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1); @@ -707,7 +820,11 @@ static void check_shape_forward(const at::Tensor& input, // If kernel size is incorrect std::ostringstream input_ss; std::ostringstream kernel_ss; +<<<<<<< HEAD std::string separator; +======= + std::string separator = ""; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int i = 0, len = input_shape.size(); i < len; ++i) { input_ss << separator << input_shape[i]; @@ -719,11 +836,14 @@ static void check_shape_forward(const at::Tensor& input, "Kernel size: (", kernel_ss.str(), "). 
Kernel size can't be greater than actual input size"); } } else { // transposed +<<<<<<< HEAD for (const auto i : c10::irange(2, k)) { TORCH_CHECK(padding[i-2] <= (std::numeric_limits::max() - padding[i-2]), "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ", (std::numeric_limits::max() / 2)); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(at::symint::size(input, 1) == weight_sizes[0], "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected input", at::symint::sizes(input), " to have ", weight_sizes[0], @@ -1029,7 +1149,11 @@ static Tensor convolution_same( if (symmetric_padding) { // All backends handle symmetric padding natively +<<<<<<< HEAD SymDimVector output_padding(dim); +======= + SymDimVector output_padding(static_cast(dim)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1049,7 +1173,11 @@ static Tensor convolution_same( } } auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); +<<<<<<< HEAD SymDimVector output_padding(dim); +======= + SymDimVector output_padding(static_cast(dim)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::convolution_symint(padded_input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1184,7 +1312,11 @@ at::Tensor convolution( bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); return at::_convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, +<<<<<<< HEAD ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV)); +======= + ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::Tensor convolution_overrideable( @@ -1329,7 +1461,11 @@ ConvBackend select_conv_backend( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); +======= + params.allow_tf32 = ctx.allowTF32CuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input = input_r; auto weight = weight_r; @@ -1451,8 +1587,17 @@ static inline at::MemoryFormat determine_backend_memory_format( } break; case ConvBackend::Mps: +<<<<<<< HEAD case ConvBackend::MpsTranspose: if (mps_conv_use_channels_last(input, weight)) { +======= + if (mps_conv_use_channels_last(input, weight)) { +#ifdef USE_MPS + if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { + break; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) backend_memory_format = (k == 5) ? 
MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; } break; @@ -1709,7 +1854,11 @@ at::Tensor _convolution( c10::MaybeOwned bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; +<<<<<<< HEAD return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV)); +======= + return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::tuple convolution_backward_overrideable( @@ -2007,7 +2156,11 @@ std::tuple convolution_backward( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); +======= + params.allow_tf32 = ctx.allowTF32CuDNN(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Validate inputs. check_shape_backward(input, weight.sizes(), params); diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 538a893d54ea0..16ab0dc2d0ea5 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -25,7 +25,11 @@ namespace at::native { namespace { +<<<<<<< HEAD Tensor compute_columns2d( +======= +static Tensor compute_columns2d( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, IntArrayRef padding, IntArrayRef stride, @@ -93,7 +97,11 @@ Tensor compute_columns2d( return columns.contiguous(); } +<<<<<<< HEAD inline void slow_conv2d_shape_check( +======= +static inline void slow_conv2d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -205,7 +213,11 @@ inline void slow_conv2d_shape_check( } } +<<<<<<< HEAD inline Tensor view_weight_2d(const Tensor& weight_, +======= +static inline Tensor view_weight_2d(const Tensor& weight_, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { @@ -220,7 +232,11 @@ inline Tensor view_weight_2d(const Tensor& weight_, } template +<<<<<<< HEAD void slow_conv2d_update_output_frame( +======= +static void slow_conv2d_update_output_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -480,7 +496,11 @@ void slow_conv2d_backward_weight_frame( } } +<<<<<<< HEAD void slow_conv2d_backward_weight_out_cpu_template( +======= +static void slow_conv2d_backward_weight_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_weight, const 
Tensor& input, const Tensor& grad_output_, diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index 894bf29456f78..e1c10ea7073a1 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -9,7 +9,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,7 +31,11 @@ namespace at::native { namespace { +<<<<<<< HEAD Tensor compute_columns3d( +======= +static Tensor compute_columns3d( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_, IntArrayRef stride, IntArrayRef padding, @@ -108,7 +115,11 @@ Tensor compute_columns3d( return columns; } +<<<<<<< HEAD inline void slow_conv3d_shape_check( +======= +static inline void slow_conv3d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -175,6 +186,7 @@ inline void slow_conv3d_shape_check( const int64_t input_height = input.size(dim_height); const int64_t input_width = input.size(dim_width); +<<<<<<< HEAD constexpr int64_t MAX_SAFE_PAD = (1LL << 61); TORCH_CHECK_VALUE( @@ -192,6 +204,8 @@ inline void slow_conv3d_shape_check( "Padding depth too large: pad_depth=", pad_depth); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_width = input_width + 2 * pad_width; @@ -239,6 +253,7 @@ inline void slow_conv3d_shape_check( output_width, "). 
Output size is too small"); +<<<<<<< HEAD uint64_t kernel_product; TORCH_CHECK( !c10::mul_overflows(kernel_height, kernel_width, &kernel_product), @@ -247,6 +262,8 @@ inline void slow_conv3d_shape_check( ", kernel_width=", kernel_width); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (weight.defined()) { int64_t n_input_plane = weight.size(1); if (weight.dim() == 2) { @@ -273,7 +290,11 @@ inline void slow_conv3d_shape_check( } } +<<<<<<< HEAD Tensor view_weight_2d(const Tensor& weight_) { +======= +static Tensor view_weight_2d(const Tensor& weight_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor weight = weight_.contiguous(); if (weight.dim() == 5) { const int64_t s1 = weight.size(0); @@ -286,7 +307,11 @@ Tensor view_weight_2d(const Tensor& weight_) { } template +<<<<<<< HEAD void slow_conv3d_update_output_frame( +======= +static void slow_conv3d_update_output_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -515,7 +540,11 @@ void slow_conv3d_backward_weight_frame( grad_weight.data(), ldc, grad_weight.stride(0) * n); } +<<<<<<< HEAD void slow_conv3d_backward_parameters_out_cpu_template( +======= +static void slow_conv3d_backward_parameters_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_weight, const Tensor& input, const Tensor& grad_output, diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 8786257a8bd62..2e6a8551c8a70 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -52,7 +52,12 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in for (const auto k : c10::irange(kw)) { int iShift = std::max(0, static_cast(k - real_pad)); int oShift = std::max(0, static_cast(real_pad - k)); +<<<<<<< HEAD long t = std::min(ilen + real_pad - k, olen) - oShift; +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int t = std::min(ilen + real_pad - k, olen) - oShift; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: gemm assumes column-major matrices // input is l*m (row-major) // weight is m*r (row-major) diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 0b3ffda30577f..e6a9408f76156 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,5 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -35,10 +39,15 @@ #endif #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace { diff --git a/aten/src/ATen/native/DilatedMaxPool2d.cpp b/aten/src/ATen/native/DilatedMaxPool2d.cpp index 641e9f14dd711..ad084a913ef86 100644 --- 
a/aten/src/ATen/native/DilatedMaxPool2d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool2d.cpp @@ -54,7 +54,11 @@ bool ceil_mode) { TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /* sizes */ @@ -130,7 +134,11 @@ const Tensor& indices) { TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } /* sizes */ diff --git a/aten/src/ATen/native/DilatedMaxPool3d.cpp b/aten/src/ATen/native/DilatedMaxPool3d.cpp index 23d77cb210720..afa493eda70ad 100644 --- a/aten/src/ATen/native/DilatedMaxPool3d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool3d.cpp @@ -63,7 +63,11 @@ void max_pool3d_with_indices_out_cpu_template( TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const int64_t nslices = input.size(-4); @@ -158,7 +162,11 @@ Tensor& max_pool3d_with_indices_backward_out_cpu_template( TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); } else { +<<<<<<< HEAD TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast3d, Contiguous"); +======= + TORCH_CHECK(false, "Unsupport memory format. Supports only ChannelsLast3d, Contiguous"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const int64_t nslices = input.size(-4); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index 21a15b80c9c84..b5675ecb79d1a 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -28,13 +28,21 @@ namespace at::native::templates { // ==================================================== Random ======================================================== // The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`. +<<<<<<< HEAD // The current implementation of `random_` uses uint64_t arithmetic and casts the result to the target dtype(scalar_t). +======= +// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t). 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance: // // auto actual = torch::empty({3, 3}, torch::half); // actual.random_(0, 65504); // +<<<<<<< HEAD // If random's uint64_t arithmetic produces 65503 as a random value after casting to torch::half it becomes 65504 +======= +// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to` // moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to // the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 5f34ed9d24c17..03f8acb547cd1 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -424,6 +424,7 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t */ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional gen) { +<<<<<<< HEAD TORCH_CHECK_VALUE( at::isFloatingType(count.scalar_type()), "binomial only supports floating-point dtypes for count, got: ", @@ -432,6 +433,8 @@ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor ret = at::zeros(count.sizes(), count.options()); auto iter = TensorIteratorConfig() .add_output(ret) diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 755fe00b1f1c5..46ea0b67227d7 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,6 +1,9 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -128,7 +131,11 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { +<<<<<<< HEAD constexpr static scalar_t kTailValues[] = { +======= + const static scalar_t kTailValues[] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -140,7 +147,11 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; +<<<<<<< HEAD if (k < std::size(kTailValues)) { +======= + if (k <= 9) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index e1076d0400f79..0f5c60e65dce0 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -14,10 +14,15 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD 
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else #include #endif @@ -108,7 +113,11 @@ bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template +<<<<<<< HEAD std::enable_if_t, void> +======= +static std::enable_if_t, void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_select_add( const Tensor& select_indices, const Tensor& add_indices, @@ -494,7 +503,11 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the index) template +<<<<<<< HEAD std::enable_if_t, void> +======= +static std::enable_if_t, void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_select_scale_add( const Tensor& select_indices, const Tensor& add_indices, diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index a344422204844..b86a3fae6d089 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,4 +1,7 @@ +<<<<<<< HEAD #pragma once +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 8e04a7490e879..a3b0323211c47 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,24 +97,42 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); +<<<<<<< HEAD auto height = self.sym_size(0); auto width = self.sym_size(1); if (nDims > 2) { for (const auto i : c10::irange(1, nDims)) { if (self.sym_size(i) != height) { +======= + int64_t height = self.size(0); + int64_t width = self.size(1); + + if (nDims > 2) { + int64_t dim1 = height; + for (const auto i : c10::irange(1, nDims)) { + if (self.size(i) != dim1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } +<<<<<<< HEAD auto storage_offset = self.sym_storage_offset(); auto size = std::min(height, width); +======= + int64_t storage_offset = self.storage_offset(); + std::vector sizes; + std::vector strides; + int64_t size = std::min(height, width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } +<<<<<<< HEAD std::vector strides{stride}; std::vector sizes{size}; @@ -129,6 +147,24 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { auto offset = self.stride(0) * (width + 1); auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); +======= + strides.push_back(stride); + sizes.push_back(size); + + auto main_diag = self.as_strided(sizes, strides, storage_offset); + main_diag.fill_(fill_value); + + if (wrap && nDims == 2 && height 
> width + 1) { + std::vector wrap_sizes; + + int64_t step = width + 1; + int64_t wrap_size = ((self.numel() + step - 1) / step) - size; + wrap_sizes.push_back(wrap_size); + + int64_t offset = self.stride(0) * (width + 1); + + auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index cb437fb45ce21..4065c93f87bae 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -260,7 +260,10 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2); \ \ std::vector result; \ +<<<<<<< HEAD result.reserve(input.size()); \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalar)); \ } \ @@ -289,7 +292,10 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ \ std::vector result; \ +<<<<<<< HEAD result.reserve(input.size()); \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalars[i])); \ } \ @@ -419,7 +425,10 @@ std::vector foreach_tensor_ternary_lerp_slow( TensorList tensors3) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); std::vector result; +<<<<<<< HEAD result.reserve(tensors1.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], tensors3[i])); } @@ -442,7 +451,10 @@ std::vector foreach_tensor_lerp_scalarlist_kernel_slow( at::ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, scalars); std::vector result; +<<<<<<< HEAD result.reserve(tensors1.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], scalars[i])); } @@ -473,7 +485,10 @@ std::vector foreach_tensor_norm_slow( std::optional dtype) { check_foreach_api_restrictions(tensors); std::vector result; +<<<<<<< HEAD result.reserve(tensors.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& t : tensors) { result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype)); } @@ -483,7 +498,10 @@ std::vector foreach_tensor_norm_slow( std::vector foreach_tensor_max_slow(TensorList tensors) { check_foreach_api_restrictions(tensors); std::vector result; +<<<<<<< HEAD result.reserve(tensors.size()); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& t : tensors) { result.emplace_back(at::max(t)); } diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index f0dce20a6eff4..a2bfe650a1219 100644 --- 
a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -22,7 +22,11 @@ namespace { // Check if tensor list has either a boolean tensor or a integer tensor inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { return std::any_of( +<<<<<<< HEAD tensors.begin(), tensors.end(), [includeBool](const auto& t) { +======= + tensors.begin(), tensors.end(), [&includeBool](const auto& t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::isIntegralType(t.scalar_type(), includeBool); }); } @@ -53,8 +57,13 @@ inline void check_foreach_api_restrictions( inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2) { +<<<<<<< HEAD check_foreach_api_restrictions(tensors1); check_foreach_api_restrictions(tensors2); +======= + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", @@ -67,8 +76,26 @@ inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2, TensorList tensors3) { +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, tensors2); check_foreach_api_restrictions(tensors1, tensors3); +======= + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK( + tensors1.size() == tensors2.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors2.size()); + TORCH_CHECK( + tensors1.size() == tensors3.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors3.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void check_foreach_api_restrictions( @@ -77,7 +104,16 @@ inline void check_foreach_api_restrictions( TensorList tensors3, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, scalars); +======= + TORCH_CHECK( + tensors1.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list, got ", + tensors1.size(), + " and ", + scalars.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void check_foreach_api_restrictions( @@ -85,7 +121,16 @@ inline void check_foreach_api_restrictions( TensorList tensors2, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2); +<<<<<<< HEAD check_foreach_api_restrictions(tensors1, scalars); +======= + TORCH_CHECK( + tensors1.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list, got ", + tensors1.size(), + " and ", + scalars.size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Helper function called in check_fast_path_restrictions to check whether all @@ -103,6 +148,7 @@ inline bool _check_tensors_share_device_and_dtype( 
tensor.is_non_overlapping_and_dense(); }; +<<<<<<< HEAD return std::all_of( tensorLists.cbegin(), tensorLists.cend(), @@ -110,6 +156,17 @@ inline bool _check_tensors_share_device_and_dtype( return std::all_of( tensorList.cbegin(), tensorList.cend(), is_tensor_okay); }); +======= + for (const auto& tensorList : tensorLists) { + for (const auto& tensor : tensorList) { + if (!is_tensor_okay(tensor)) { + return false; + } + } + } + + return true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Helper function called in check_fast_path_restrictions to check if @@ -155,9 +212,17 @@ inline bool _check_tensors_do_type_promotion_with_scalars( bool does_op_promote_integer_inputs_to_float = false) { for (const auto i : c10::irange(tensorList.size())) { // For division, integer inputs will result in float. +<<<<<<< HEAD if (does_op_promote_integer_inputs_to_float && at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) { return false; +======= + if (does_op_promote_integer_inputs_to_float) { + if (at::isIntegralType( + tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (!scalarList.empty()) { const auto& scalar = @@ -334,6 +399,7 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( } }), "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); +<<<<<<< HEAD grouped_tensors_with_indices.try_emplace( key, TensorsAndIndicesT{ @@ -362,6 +428,38 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( return indices; } }()}); +======= + if (!grouped_tensors_with_indices.count(key)) { + grouped_tensors_with_indices.insert( + {key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. 
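The surrounding hunk reworks how _group_tensors_by_first_tensors_device_and_dtype builds its per-(device, dtype) buckets: the HEAD side uses try_emplace while the incoming side checks count() before insert(), and both reserve the inner vectors up front as the NB comment explains. A stand-alone sketch of that bucketing pattern, using plain std::string keys and int payloads as stand-ins for the device/dtype key and tensor references, might look like this:

#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Group values by key. try_emplace constructs the bucket only when the key is
// new (the equivalent of the count()/insert() dance on the incoming side), and
// reserve() trades a little memory for fewer reallocations.
std::unordered_map<std::string, std::vector<int>> group_by_key(
    const std::vector<std::pair<std::string, int>>& items,
    std::size_t max_per_key) {
  std::unordered_map<std::string, std::vector<int>> grouped;
  for (const auto& [key, value] : items) {
    auto [it, inserted] = grouped.try_emplace(key);
    if (inserted) {
      it->second.reserve(max_per_key);  // reserve the max possible length once
    }
    it->second.push_back(value);
  }
  return grouped;
}

Because try_emplace only constructs the bucket on first sight of a key, the reserve() call runs exactly once per key.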
+ tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(tensors); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}}); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& list_index : c10::irange(num_lists)) { if (!nested_tensorlist[list_index].empty()) { grouped_tensors_with_indices[key].first[list_index].emplace_back( diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 664a612d0b137..228db8c60d65d 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -130,7 +130,11 @@ namespace native { namespace { template +<<<<<<< HEAD void fractional_max_pool2d_out_single_batch_frame( +======= +static void fractional_max_pool2d_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -188,7 +192,11 @@ void fractional_max_pool2d_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_out_frame( +======= +static void fractional_max_pool2d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -220,7 +228,11 @@ void fractional_max_pool2d_out_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_backward_out_single_batch_frame( +======= +static void fractional_max_pool2d_backward_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -247,7 +259,11 @@ void fractional_max_pool2d_backward_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool2d_backward_out_frame( +======= +static void fractional_max_pool2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 5ed3fdeab7651..538e99c8a77de 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -67,6 +67,7 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); +<<<<<<< HEAD TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT), "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); @@ -74,6 +75,15 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH), +======= + TORCH_CHECK(outputT + poolSizeT - 1 < inputT, + "fractional_max_pool3d_out(): pool time ", poolSizeT, + " too large relative to input time ", inputT); + TORCH_CHECK(outputW + poolSizeW - 1 < inputW, + "fractional_max_pool3d_out(): pool width ", poolSizeW, + " too large relative to input width ", 
inputW); + TORCH_CHECK(outputH + poolSizeH - 1 < inputH, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); @@ -99,7 +109,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void fractional_max_pool3d_out_single_batch_frame( +======= +static void fractional_max_pool3d_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -169,7 +183,11 @@ void fractional_max_pool3d_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool3d_out_frame( +======= +static void fractional_max_pool3d_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input, scalar_t* output, int64_t* indices, @@ -257,7 +275,11 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( namespace { template +<<<<<<< HEAD void fractional_max_pool3d_backward_out_single_batch_frame( +======= +static void fractional_max_pool3d_backward_out_single_batch_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -287,7 +309,11 @@ void fractional_max_pool3d_backward_out_single_batch_frame( } template +<<<<<<< HEAD void fractional_max_pool3d_backward_out_frame( +======= +static void fractional_max_pool3d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 0ca8ec2a3a887..37454e5a0f260 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -86,7 +86,11 @@ namespace { for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; @@ -285,7 +289,11 @@ namespace { for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; @@ -496,7 +504,11 @@ static Tensor _grid_sampler_2d_cpu_quantized( uint8_t* inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { 
+<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; float x = *grid_ptr_NHW; float y = grid_ptr_NHW[grid_sCoor]; @@ -599,7 +611,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { +<<<<<<< HEAD // get the corresponding input x, y, z coordinates from grid +======= + // get the corresponding input x, y, z co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -771,7 +787,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { +<<<<<<< HEAD // get the corresponding input x, y coordinates from grid +======= + // get the corresponding input x, y co-ordinates from grid +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h index 3388af7b8a0a7..56fd4d728c193 100644 --- a/aten/src/ATen/native/GridSamplerUtils.h +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -93,12 +93,15 @@ inline bool cond_cudnn_grid_sampler( const TensorBase& input, const TensorBase& grid ) { +<<<<<<< HEAD auto st = input.scalar_type(); if (!(st == kDouble || st == kFloat || st == kHalf)) return false; st = grid.scalar_type(); if (!(st == kDouble || st == kFloat || st == kHalf)) return false; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ( at::native::cudnn_is_acceptable(input) && at::native::cudnn_is_acceptable(grid) && diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index 5919997cf5fe5..f30376a5a01eb 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -23,7 +23,10 @@ #include #endif +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -203,6 +206,7 @@ select_outer_bin_edges(const Tensor& input, std::optional> return std::make_pair(leftmost_edges, rightmost_edges); } +<<<<<<< HEAD /* Bin edges correction based on the precision representation. * To maintain the backward compatibility we take max(std::nextafter<>, +1) @@ -243,6 +247,8 @@ void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &r #undef UPDATE_WITH_LIMIT } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* histc's version of the logic for outermost bin edges. 
*/ std::pair histc_select_outer_bin_edges(const Tensor& input, @@ -257,7 +263,12 @@ std::pair histc_select_outer_bin_edges(const Tensor& input, } if (leftmost_edge == rightmost_edge) { +<<<<<<< HEAD bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); +======= + leftmost_edge -= 1; + rightmost_edge += 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index acdcb2b27bda2..869ab96ea7327 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -19,7 +19,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void im2col_out_cpu_template( +======= +static void im2col_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp index 16d7c8670699a..4b623b5e36517 100644 --- a/aten/src/ATen/native/IndexingUtils.cpp +++ b/aten/src/ATen/native/IndexingUtils.cpp @@ -16,7 +16,12 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) { auto linearId = elements - 1; // NOTE: Assumes all strides are positive, which is true for now +<<<<<<< HEAD for (auto i = t.dim() - 1; i >= 0; --i) { +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + for (int i = t.dim() - 1; i >= 0; --i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto curDimIndex = linearId % t.sym_size(i); auto curDimOffset = curDimIndex * t.sym_stride(i); offset += curDimOffset; diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 948a6b8320a4e..612f479ddeaa5 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -5,6 +5,7 @@ #include #include +<<<<<<< HEAD #ifndef AT_PER_OPERATOR_HEADERS #include #else @@ -12,6 +13,8 @@ #include #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { [[noreturn]] @@ -22,8 +25,12 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, [[maybe_unused]] static std::vector expandTensors( const Tensor& self, +<<<<<<< HEAD IOptTensorListRef indices, bool ensure_same_device = false) { +======= + IOptTensorListRef indices) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // If indices come in as ByteTensor or BoolTensor (masks), expand them into // the equivalent indexing by LongTensors std::vector result; @@ -46,6 +53,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } } // Replace with nonzeros +<<<<<<< HEAD at::Tensor nonzero; if (ensure_same_device && index.device() != self.device()) { bool non_blocking = index.is_cpu() && self.device().is_cuda(); @@ -59,6 +67,12 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } } else if (ensure_same_device && index.device() != self.device()) { result.emplace_back(index.to(self.device())); +======= + auto nonzero = index.nonzero(); + for (const auto 
j : c10::irange(index.dim())) { + result.emplace_back(nonzero.select(1, j)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { result.emplace_back(index); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index a744da3bcad2e..3ad3024e59735 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -93,7 +93,11 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { // Also hit the fused path for contiguous 3D input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. +<<<<<<< HEAD bool is_contiguous = input.is_contiguous_or_false(); +======= + bool is_contiguous = definitely_contiguous(input.sym_sizes(), input.sym_strides(), input.sym_numel()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_contiguous && input_dim == 3) { return _flatten_nd_linear(input, weight, *bias); } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { @@ -154,8 +158,13 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra Tensor left = left_; Tensor right = right_; for (const auto i : c10::irange(dim)) { +<<<<<<< HEAD auto sl = TORCH_GUARD_OR_TRUE(left.sym_size(i).sym_ne(1)); auto sr = TORCH_GUARD_OR_TRUE(right.sym_size(i).sym_ne(1)); +======= + auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1)); + auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); @@ -185,6 +194,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions +<<<<<<< HEAD // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. 
// However, if all dimensions from the right operand appear before those from the left // operand in the final output, we can swap the operands so that bmm directly produces @@ -196,6 +206,8 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra std::swap(lo, ro); std::swap(lo_size, ro_size); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); @@ -488,7 +500,11 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // Iterate over each dimension covered by ellipsis const auto ndim = operands[i].ndimension() - (static_cast(op_labels[i].size()) - 1); for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) { +<<<<<<< HEAD if (TORCH_GUARD_OR_TRUE(op.sym_size(dim).sym_ne(1))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Update ellipsis size TORCH_SYM_CHECK( ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))), @@ -507,7 +523,11 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr permutation[ell_index + j] = dim++; } } else if (permutation[label_perm_index[s]] == -1) { +<<<<<<< HEAD if (TORCH_GUARD_OR_TRUE(op.sym_size(dim).sym_ne(1))) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Update subscript TORCH_SYM_CHECK( label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))), @@ -585,6 +605,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr SmallVector a_dims_to_sum; SmallVector b_dims_to_sum; for (auto dim = out_num_dim; dim < perm_index; ++dim) { +<<<<<<< HEAD auto sa = TORCH_GUARD_OR_TRUE(a.sym_size(dim).sym_ne(1)); auto sb = TORCH_GUARD_OR_TRUE(b.sym_size(dim).sym_ne(1)); @@ -592,15 +613,26 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // if both a and b are equal, or we can't tell that its a broadcast for sure, // we assume non-broadcast. 
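// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the bookkeeping the einsum hunk
// above performs on SymInts, replayed with plain int64_t sizes so the branch
// structure is easier to follow. The guard macros (TORCH_GUARD_OR_TRUE /
// TORCH_GUARD_SIZE_OBLIVIOUS) are replaced here by a plain `size != 1` test,
// which is only equivalent when every size is a known, backed integer; all
// names below are invented for the sketch.
#include <cstdint>
#include <vector>

struct ContractionPlan {
  std::vector<int64_t> sum_dims;       // contracted inside the single bmm
  std::vector<int64_t> a_dims_to_sum;  // summed out of `a` before the bmm
  std::vector<int64_t> b_dims_to_sum;  // summed out of `b` before the bmm
};

inline ContractionPlan plan_contraction(const std::vector<int64_t>& a_sizes,
                                        const std::vector<int64_t>& b_sizes,
                                        std::vector<int64_t> dim_counts,
                                        int64_t out_num_dim) {
  ContractionPlan plan;
  const auto perm_index = static_cast<int64_t>(a_sizes.size());
  for (int64_t dim = out_num_dim; dim < perm_index; ++dim) {
    const bool sa = a_sizes[dim] != 1;  // dim is non-broadcast in a
    const bool sb = b_sizes[dim] != 1;  // dim is non-broadcast in b
    if (sa && sb) {
      // Both operands carry the dimension: once only these two uses remain,
      // it can be reduced as part of the batched matmul itself.
      if (--dim_counts[dim] == 1) {
        plan.sum_dims.push_back(dim);
        dim_counts[dim] = 0;
      }
    } else if (dim_counts[dim] == 1) {
      // Only one operand carries it and nothing downstream needs it, so it
      // is summed out of that operand before the matmul.
      if (sa) {
        plan.a_dims_to_sum.push_back(dim);
        dim_counts[dim] = 0;
      } else if (sb) {
        plan.b_dims_to_sum.push_back(dim);
        dim_counts[dim] = 0;
      }
    }
  }
  return plan;
}
// ---------------------------------------------------------------------------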
TORCH_SYM_CHECK(a.sym_size(dim).sym_eq(b.sym_size(dim)), "non-broadcast dimensions must match"); +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1)) + && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (--dim_counts[dim] == 1) { sum_dims.push_back(dim); dim_counts[dim] = 0; } } else if (dim_counts[dim] == 1) { +<<<<<<< HEAD if (sa) { a_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } else if (sb) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) { + a_dims_to_sum.push_back(dim); + dim_counts[dim] = 0; + } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) b_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index c07c7a5ac6e07..8bb1f117cdad0 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,8 +1360,11 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif +<<<<<<< HEAD #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() // Used by default on x86 platforms and on AArch64+ACL +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,7 +1398,12 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { const int64_t min_size = get_mkldnn_matmul_min_size(); return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } +<<<<<<< HEAD #endif +======= + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1771,8 +1779,12 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; +<<<<<<< HEAD #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1783,7 +1795,11 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens at::globalContext().setUserEnabledMkldnn(false); } } +<<<<<<< HEAD #endif +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { @@ -2801,7 +2817,10 @@ Tensor matrix_exp(const Tensor& a) { // TODO This should be deprecated in 
favor of linalg_matrix_exp_differential // in FunctionsManual.cpp Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { +<<<<<<< HEAD squareCheckInputs(self, "matrix_exp_backward"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, @@ -3620,7 +3639,11 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) try { mkldnn_matmul_i8i8i32(self, mat2, result); dispatched = true; +<<<<<<< HEAD } catch ([[maybe_unused]] const std::exception& e) { +======= + } catch (const std::exception& e) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); } } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 257863573d3a8..f8db79e80d4cc 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -148,7 +148,11 @@ inline void checkInputsSolver(const Tensor& A, inline bool is_row_or_column_contiguous(const Tensor& t) { // This could be made more general, similar to how it's checked in matmul, which would allow to +<<<<<<< HEAD // elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. +======= + // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We choose to be conservative for simplicity return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); } diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 40d79d97c0cdf..c379fbd6f9e24 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -61,7 +61,11 @@ constexpr float EPSILON = 1e-12; namespace { +<<<<<<< HEAD inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +======= + static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -127,9 +131,12 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out) TORCH_IMPL_FUNC(mse_loss_out) (const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) { +<<<<<<< HEAD TORCH_CHECK(input.device() == target.device(), "Expected all tensors to be on the same device, but found at least two devices, ", input.device(), " and ", target.device(), "!"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (reduction != Reduction::None) { Tensor loss; auto iter = TensorIterator::borrowing_binary_op(loss, input, target); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 2e2bc5542b51b..65d4826cf2793 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -44,7 +44,11 @@ namespace { // this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done template +<<<<<<< HEAD inline int64_t get_target_prime(target_t* 
target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +======= +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (idx % 2 == 0) { return BLANK; } else { diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index b524d277cd0aa..fc725045f4b99 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -58,7 +58,11 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } template +<<<<<<< HEAD void multilabel_margin_loss_forward_out_frame( +======= +static void multilabel_margin_loss_forward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_contiguous, const Tensor& target_contiguous, Tensor& output, @@ -108,7 +112,11 @@ void multilabel_margin_loss_forward_out_frame( } } +<<<<<<< HEAD void multilabel_margin_loss_forward_out_cpu_template( +======= +static void multilabel_margin_loss_forward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& target, Tensor& output, @@ -153,7 +161,11 @@ void multilabel_margin_loss_forward_out_cpu_template( } template +<<<<<<< HEAD void multilabel_margin_loss_backward_out_frame( +======= +static void multilabel_margin_loss_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input_contiguous, @@ -222,7 +234,11 @@ void multilabel_margin_loss_backward_out_frame( } } +<<<<<<< HEAD void multilabel_margin_loss_backward_out_cpu_template( +======= +static void multilabel_margin_loss_backward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index f9dc074a6983b..099c353ab5968 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -57,7 +57,11 @@ inline int64_t target_index_checked( } template +<<<<<<< HEAD inline void multi_margin_loss_cpu_kernel( +======= +static inline void multi_margin_loss_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, const scalar_t* input_data, const int64_t* target_data, @@ -148,7 +152,11 @@ void multi_margin_loss_out_cpu_template( } template +<<<<<<< HEAD void multi_margin_loss_backward_cpu_kernel( +======= +static void multi_margin_loss_backward_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* grad_input_data, const Tensor& grad_output, const scalar_t* input_data, diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 576f56986988b..52eda70660bb7 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,6 +47,7 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( 
target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); +<<<<<<< HEAD if (self.dim() == 1 && target.dim() == 1) { TORCH_CHECK_VALUE( target.size(0) == 1, @@ -55,6 +56,12 @@ TORCH_META_FUNC(nll_loss_forward) } TORCH_CHECK( self.dim() == 1 || (self.size(0) == target.size(0)), +======= + + auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + TORCH_CHECK( + no_batch_dim || (self.size(0) == target.size(0)), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "size mismatch (got input: ", self.sizes(), ", target: ", @@ -159,7 +166,11 @@ inline scalar_t* optional_data(const Tensor& source) { } template +<<<<<<< HEAD void nll_loss_out_frame( +======= +static void nll_loss_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& output, const Tensor& total_weight, const Tensor& input, @@ -338,7 +349,11 @@ void nll_loss_forward_out_cpu_template( } template +<<<<<<< HEAD void nll_loss_backward_out_frame( +======= +static void nll_loss_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 7bea90cbd5274..e668eb62302a8 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -99,7 +99,11 @@ inline void check_gradout_shape_nll_loss2d( template +<<<<<<< HEAD void nll_loss2d_forward_out_frame( +======= +static void nll_loss2d_forward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& output, Tensor& total_weight, const Tensor& input, @@ -280,7 +284,11 @@ void nll_loss2d_forward_out_cpu_template( } template +<<<<<<< HEAD void nll_loss2d_backward_out_frame( +======= +static void nll_loss2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 4677542706f6b..49f983aeb9cae 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,11 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation +<<<<<<< HEAD static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { +======= + static const scalar_t lanczos_sum_expg_scaled_num[13] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +600,11 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; +<<<<<<< HEAD static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { +======= + static const scalar_t lanczos_sum_expg_scaled_denom[13] = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1., 66., 1925., @@ -712,7 +720,11 @@ static scalar_t 
_igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] +<<<<<<< HEAD static constexpr scalar_t d[25][25] = +======= + static const scalar_t d[25][25] = +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, @@ -1068,7 +1080,11 @@ inline scalar_t calc_igammac(scalar_t a, scalar_t x) { * result at the boundary * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for * Large Parameter (see DLMF 8.12.4 [igam1]) +<<<<<<< HEAD * - if x > 1.1 and x < a, using the subtraction from the regularized lower +======= + * - if x > 1.1 and x < a, using the substraction from the regularized lower +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * incomplete gamma * - otherwise, calculate the series from [igam2] eq (5) */ @@ -1148,7 +1164,11 @@ scalar_t calc_igamma(scalar_t a, scalar_t x) { * result at the boundary * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for * Large Parameter (see DLMF 8.12.3 [igam1]) +<<<<<<< HEAD * - if x > 1 and x > a, using the subtraction from the regularized upper +======= + * - if x > 1 and x > a, using the substraction from the regularized upper +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * incomplete gamma * - otherwise, calculate the series from [igam2] eq (4) */ @@ -1730,7 +1750,11 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) { with the usual checks for overflow etcetera. Performance-wise, it seems to be substantially faster than either +<<<<<<< HEAD the SLATEC DERFC function [or an erfcx function derived there from] +======= + the SLATEC DERFC function [or an erfcx function derived therefrom] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) or Cody's CALERF function (from netlib.org/specfun), while retaining near machine precision in accuracy. 
*/ @@ -2862,7 +2886,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) { T q = x; T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -2910,7 +2938,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) { T q = x + x; T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -2966,7 +2998,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) { T q = x + x - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -3026,7 +3062,11 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) { T q = x + x + T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x) * q - p; p = q; q = r; @@ -3150,7 +3190,11 @@ inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) { T q = T(1.0) - x; T r; +<<<<<<< HEAD for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { +======= + for (int64_t k = 1; k < n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; @@ -3190,7 +3234,11 @@ inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) { T q = x; T r; +<<<<<<< HEAD for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { +======= + for (int64_t k = 1; k < n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; @@ -3733,7 +3781,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) T q = x + x - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3785,7 +3837,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3841,7 +3897,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for 
(int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; @@ -3897,7 +3957,11 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); T r; +<<<<<<< HEAD for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { +======= + for (int64_t k = 2; k <= n; k++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; p = q; q = r; diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index f91b892efec21..9681ca3b3454a 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,6 +23,11 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); +<<<<<<< HEAD +======= + auto oheight = output_size[0]; + auto owidth = output_size[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -43,9 +48,12 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } +<<<<<<< HEAD auto oheight = output_size[0]; auto owidth = output_size[1]; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index 08c42a0d470c7..d71507f8734ec 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -24,7 +24,11 @@ namespace at { namespace { +<<<<<<< HEAD inline void slow_conv_transpose2d_shape_check( +======= +static inline void slow_conv_transpose2d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -386,7 +390,11 @@ void slow_conv_transpose2d_out_cpu_template( } } +<<<<<<< HEAD void slow_conv_transpose2d_backward_out_cpu_template( +======= +static void slow_conv_transpose2d_backward_out_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input_, const Tensor& grad_output_, Tensor& grad_input, diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index 469269ab07dfb..876e8700b7697 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -22,7 +22,11 @@ namespace at::native { namespace { +<<<<<<< HEAD inline void slow_conv_transpose3d_shape_check( +======= +static inline void slow_conv_transpose3d_shape_check( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) const Tensor& input, const Tensor& grad_output, const Tensor& weight, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 0914ede84034c..a6d05cb58c9b5 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -61,9 +61,14 @@ #include #include #include +<<<<<<< HEAD #include static constexpr int MIOPEN_DIM_MAX = 5; +======= + +static const int MIOPEN_DIM_MAX = 5; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { @@ -93,7 +98,11 @@ namespace { arg_name, " should contain ", expected, " elements not ", actual); } +<<<<<<< HEAD inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { +======= + static inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (t.defined()) { return t.repeat_symint(repeat); } @@ -495,8 +504,11 @@ static std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } +<<<<<<< HEAD static bool PYTORCH_MIOPEN_EXTRA_LOGGING = c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BatchNormBackend _select_batch_norm_backend( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps) { @@ -504,6 +516,7 @@ BatchNormBackend _select_batch_norm_backend( auto& ctx = at::globalContext(); bool cudnn_enabled = ctx.userEnabledCuDNN(); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _select_batch_norm_backend" @@ -517,6 +530,8 @@ BatchNormBackend _select_batch_norm_backend( << " input.dim=" << input.dim() << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( input.is_cuda() && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 @@ -537,6 +552,7 @@ BatchNormBackend _select_batch_norm_backend( } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen +<<<<<<< HEAD // See https://github.com/pytorch/pytorch/issues/64427. 
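// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pattern described around
// this conflict -- an opt-in environment flag whose default is derived from
// the detected MIOpen version -- written with only the standard library.
// `env_flag_or` is a hypothetical stand-in for the
// c10::utils::check_env(...).value_or(...) call used by both sides, and the
// 30500 threshold mirrors the HEAD side (MIOpen 3.5 / ROCm 7.0).
#include <cstdlib>
#include <string>

inline bool env_flag_or(const char* name, bool default_value) {
  const char* raw = std::getenv(name);   // re-read at call time so tests can
  if (raw == nullptr) {                  // flip the variable at runtime
    return default_value;
  }
  const std::string value(raw);
  return value == "1" || value == "ON" || value == "TRUE" || value == "YES";
}

inline bool suggest_nhwc_batchnorm(long miopen_version) {
  const bool default_on = miopen_version >= 30500;  // MIOpen 3.5 and newer
  return env_flag_or("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM", default_on);
}
// ---------------------------------------------------------------------------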
// non static variable is used to be able to change environment variable in runtime for testing // enabled by default for ROCm >= 7.0.0 with miopen 3.5 @@ -544,6 +560,12 @@ BatchNormBackend _select_batch_norm_backend( bool is_miopen_3_4 = miopen_version >= 30400; // ROCm 6.4 bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(is_miopen_3_5); +======= + // See #64427 + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( detail::getCUDAHooks().compiledWithMIOpen() @@ -552,15 +574,30 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble +<<<<<<< HEAD && (is_miopen_3_4 || input.scalar_type() != at::kBFloat16) +======= +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif + && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) && (input.suggest_memory_format() == MemoryFormat::Contiguous +<<<<<<< HEAD || (is_miopen_3_5 && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM && (input.suggest_memory_format() == MemoryFormat::ChannelsLast || input.suggest_memory_format() == MemoryFormat::ChannelsLast3d))) +======= +#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) +#endif + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) { return BatchNormBackend::Miopen; } @@ -578,6 +615,7 @@ std::tuple _batch_norm_impl_index( const Tensor& input, const std::optional& weight_opt /* optional */, const std::optional& bias_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index" @@ -592,6 +630,8 @@ std::tuple _batch_norm_impl_index( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& bias = bias_opt.value_or(Tensor()); @@ -651,6 +691,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, 
input.options().dtype(kByte)); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (use_miopen)" @@ -674,6 +715,12 @@ std::tuple _batch_norm_impl_index( input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), +======= + if (backend == BatchNormBackend::Miopen) { + return std::tuple_cat( + at::miopen_batch_norm( + input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? running_var.contiguous() : running_var, training, momentum, eps), @@ -681,9 +728,12 @@ std::tuple _batch_norm_impl_index( std::make_tuple(2)); } +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling native_batch_norm)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::tuple_cat( at::native_batch_norm( input, weight, bias, running_mean, running_var, training, momentum, eps), @@ -696,8 +746,11 @@ std::tuple _batch_norm_impl_index_backward( const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, const std::optional& save_mean_opt /* optional */, const std::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -728,16 +781,22 @@ std::tuple _batch_norm_impl_index_backward( // backward in inference mode is not supported in cudnn, fallback to native if (impl_index == 0 || (!train)) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling native_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { // TODO: _batch_norm_impl_index_backward is only used in JIT. 
cudnn NHWC // format conversion is done inside cudnn_batch_norm_backward instead return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon, reservedSpace); } else if (impl_index == 2) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling miopen_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon); } TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); @@ -748,6 +807,7 @@ Tensor batch_norm( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout @@ -763,11 +823,16 @@ Tensor batch_norm( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& weight = weight_opt.value_or(Tensor()); const Tensor& bias = bias_opt.value_or(Tensor()); const Tensor& running_mean = running_mean_opt.value_or(Tensor()); const Tensor& running_var = running_var_opt.value_or(Tensor()); +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); // TODO: switch to the new stack after the 2 week FC window diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2a20f95f10c20..4e1efae3926a7 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,6 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -25,6 +28,7 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } +<<<<<<< HEAD { // If `self` is a DTensor, then allow implicit replication // of the `index` Tensor. @@ -44,6 +48,22 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } else { shape.emplace_back(num_classes); return at::empty_symint(shape, self.options()); +======= + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } + + auto shape = self.sizes().vec(); + + // empty tensor could be converted to one hot representation, + // but shape inference is not possible. 
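// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the one-hot encoding that both
// sides of this conflict build with tensor ops, written out for a flat vector
// of labels. The real operator appends num_classes to the input shape and
// scatters 1s along the new trailing dimension; the helper below is a
// plain-C++ stand-in with invented names and exceptions in place of
// TORCH_CHECK.
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<std::vector<int64_t>> one_hot_rows(
    const std::vector<int64_t>& labels, int64_t num_classes) {
  if (num_classes == -1) {
    // Infer the class count from the data, as the operator does.
    if (labels.empty()) {
      throw std::invalid_argument(
          "Can not infer total number of classes from empty tensor.");
    }
    num_classes = *std::max_element(labels.begin(), labels.end()) + 1;
  }
  std::vector<std::vector<int64_t>> out(
      labels.size(), std::vector<int64_t>(num_classes, 0));
  for (size_t i = 0; i < labels.size(); ++i) {
    // Mirrors ret.scatter_(-1, self.unsqueeze(-1), 1); .at() throws on
    // out-of-range labels instead of the operator's explicit range checks.
    out[i].at(labels[i]) = 1;
  }
  return out;
}
// ---------------------------------------------------------------------------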
+ if (self.numel() == 0) { + if (num_classes <= 0) { + TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); + } else { + shape.push_back(num_classes); + return at::empty(shape, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -66,8 +86,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } +<<<<<<< HEAD shape.emplace_back(num_classes); Tensor ret = at::zeros_symint(shape, self.options()); +======= + shape.push_back(num_classes); + Tensor ret = at::zeros(shape, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ret.scatter_(-1, self.unsqueeze(-1), 1); return ret; } diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 986447bab6141..bd23379a39598 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -70,10 +70,17 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } +<<<<<<< HEAD for (const auto i : c10::irange(l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", +======= + for (const auto i : c10::irange((size_t)l_pad)) { + auto pad_idx = pad.size() - ((i + 1) * 2); + auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. 
Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); @@ -240,6 +247,7 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } +<<<<<<< HEAD std::ostringstream error_msg; error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; @@ -249,6 +257,10 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; C10_THROW_ERROR(NotImplementedError, error_msg.str()); +======= + C10_THROW_ERROR(NotImplementedError, + "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/PixelShuffle.h b/aten/src/ATen/native/PixelShuffle.h index 46ffa7ddb23c3..c61673770ac54 100644 --- a/aten/src/ATen/native/PixelShuffle.h +++ b/aten/src/ATen/native/PixelShuffle.h @@ -11,8 +11,11 @@ inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_facto "pixel_shuffle expects a positive upscale_factor, but got ", upscale_factor); int64_t c = self.size(-3); +<<<<<<< HEAD TORCH_CHECK_VALUE(upscale_factor <= std::numeric_limits::max() / upscale_factor, "upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=", upscale_factor); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t upscale_factor_squared = upscale_factor * upscale_factor; TORCH_CHECK(c % upscale_factor_squared == 0, "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 7f335de04b90a..204b3afe97ad3 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -17,7 +17,11 @@ using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& g DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel) DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel) +<<<<<<< HEAD // average pooling has same signature for forward and backward +======= +// averge pooling has same signature for forward and backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional divisor_override); using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, @@ -26,7 +30,11 @@ using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel) DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel) +<<<<<<< HEAD // average pooling has same signature for forward and backward +======= +// averge pooling has same signature for forward and backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, 
int64_t padD, bool count_include_pad, diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 3a3ab3794c3cd..fc41897456caf 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -25,11 +25,17 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // USE_FBGEMM namespace caffe2 { @@ -68,6 +74,10 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( const float* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK(input.dim() >= 2); +<<<<<<< HEAD +======= + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t K = input.size(input.dim() - 1); TORCH_CHECK(weight.dim() == 2); @@ -410,8 +420,12 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, +<<<<<<< HEAD const std::optional& bias, at::Tensor& output) { +======= + const Tensor& bias) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -432,15 +446,25 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows()) TORCH_CHECK(input.dim() >= 2); +<<<<<<< HEAD +======= + TORCH_CHECK(bias.dim() == 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t N = packed_weight_fp16.numCols(); +<<<<<<< HEAD std::vector output_size = input.sizes().vec(); output_size.back() = N; // Resize output Tensor output.resize_(output_size); +======= + std::vector output_size = input.sizes().vec(); + output_size.back() = N; + Tensor output = at::empty(output_size, input.options().dtype(at::kFloat)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Call the fp16 gemm interface fbgemm::cblas_gemm_compute( @@ -452,16 +476,21 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( output.data_ptr()); // Add bias term +<<<<<<< HEAD c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias); const Tensor& bias_ = *bias_maybe_owned; if (bias_.defined()) { TORCH_CHECK(bias_.dim() == 1); output.add_(bias_); } +======= + output.add_(bias); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return output; } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -470,6 +499,8 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output); } +======= +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -478,6 +509,7 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias); } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -487,6 +519,8 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias, output); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else // USE_FBGEMM Tensor fbgemm_linear_int8_weight_fp32_activation( @@ -576,8 +610,12 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, +<<<<<<< HEAD const std::optional& bias, at::Tensor& output) { +======= + const Tensor& bias) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -588,6 +626,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( false, "This PyTorch installation was not built with FBGEMM operators"); } +<<<<<<< HEAD Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -617,6 +656,8 @@ Tensor fbgemm_linear_fp16_weight( false, "This PyTorch installation was not built with FBGEMM operators"); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 75b30320b0276..8168097d942f9 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -108,6 +108,7 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) { return false; } +<<<<<<< HEAD bool use_cudnn(const Tensor& t) { bool acceptable = at::cudnn_is_acceptable(t); auto st = t.scalar_type(); @@ -115,6 +116,8 @@ bool use_cudnn(const Tensor& t) { return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template using pair_of = std::pair; @@ -538,7 +541,11 @@ c10::intrusive_ptr make_quantized_cell_params_fp16( std::move(w_ih_packed), std::move(w_hh_packed)); } +<<<<<<< HEAD std::unordered_map< +======= +static std::unordered_map< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string, c10::intrusive_ptr (*)(CellParamsSerializationType)> cell_params_deserializers = { @@ -578,7 +585,11 @@ struct QRNNCellParamsWrapper { // Gathers every two elements of a vector in a vector of pairs template +<<<<<<< HEAD std::vector> pair_vec(const std::vector& vals) { +======= +static std::vector> pair_vec(const std::vector& vals) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); @@ -590,7 +601,11 @@ std::vector> pair_vec(const 
std::vector& vals) { // Flattens a vector of pairs template +<<<<<<< HEAD std::vector unpair_vec(std::vector>&& vals) { +======= +static std::vector unpair_vec(std::vector>&& vals) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector result; result.reserve(vals.size() * 2); for (const auto i : c10::irange(vals.size())) { @@ -601,7 +616,11 @@ std::vector unpair_vec(std::vector>&& vals) { } // Parses a flat list of parameter tensors into a list of CellParams +<<<<<<< HEAD std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +======= +static std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static at::Tensor undefined; std::vector result; if (has_biases) { @@ -1207,7 +1226,11 @@ std::tuple _thnn_fused_lstm_cell_backwar bool train, \ bool bidirectional, \ bool batch_first) { \ +<<<<<<< HEAD if (use_cudnn(_input)) { \ +======= + if (at::cudnn_is_acceptable(_input)) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy; \ NAME##_cudnn_stub( \ _input.device().type(), \ @@ -1269,7 +1292,11 @@ std::tuple _thnn_fused_lstm_cell_backwar double dropout_p, \ bool train, \ bool bidirectional) { \ +<<<<<<< HEAD if (use_cudnn(data)) { \ +======= + if (at::cudnn_is_acceptable(data)) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy; \ NAME##_packed_cudnn_stub( \ data.device().type(), \ @@ -1437,7 +1464,11 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); +<<<<<<< HEAD if (use_cudnn(_input)) { +======= + if (at::cudnn_is_acceptable(_input)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy, cy; lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first); @@ -1498,7 +1529,11 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); +<<<<<<< HEAD if (use_cudnn(data)) { +======= + if (at::cudnn_is_acceptable(data)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output, hy, cy; lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional); @@ -1894,10 +1929,17 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple namespace { +<<<<<<< HEAD [[maybe_unused]] auto ensure_linear_params_registered = register_linear_params(); auto cell_params_base_registry = +======= +[[maybe_unused]] static auto ensure_linear_params_registered = + register_linear_params(); + +static auto cell_params_base_registry = +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch::selective_class_("rnn", 
TORCH_SELECTIVE_CLASS("CellParamsBase")) .def_pickle( [](const c10::intrusive_ptr& self) diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index fd62b8e01329b..3c05e4e51db43 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -47,7 +47,11 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { +<<<<<<< HEAD size_d = std::ceil((end.to() - start.to()) +======= + size_d = std::ceil(static_cast(end.to() - start.to()) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) / step.to()); } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index db046428bb683..0a5a154d59cd5 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -71,8 +71,11 @@ #include #include #include +<<<<<<< HEAD #include #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -220,8 +223,11 @@ static void check_argmax_argmin( const char* name, const Tensor& self, const std::optional& dim) { +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); TORCH_CHECK(!(self.scalar_type() == kBool), name, ": does not support bool input"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); @@ -402,6 +408,7 @@ TORCH_META_FUNC(amin) resize_reduction(*this, self, dim, keepdim, out_dtype); } +<<<<<<< HEAD TORCH_META_FUNC(hash_tensor) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode) { auto maybe_result = maybe_get_output(); @@ -415,6 +422,8 @@ TORCH_META_FUNC(hash_tensor) } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::meta namespace at::native { @@ -458,7 +467,10 @@ DEFINE_DISPATCH(argmin_stub); DEFINE_DISPATCH(cumsum_stub); DEFINE_DISPATCH(cumprod_stub); DEFINE_DISPATCH(logcumsumexp_stub); +<<<<<<< HEAD DEFINE_DISPATCH(xor_sum_stub); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor _logcumsumexp_cpu(const Tensor& self, int64_t dim) { Tensor result = at::empty_like(self, MemoryFormat::Contiguous); @@ -1469,7 +1481,11 @@ Tensor& nanmean_out( "nanmean(): expected input to have floating point or complex dtype but got ", self.scalar_type()); const auto factor = at::native::isnan(self).logical_not_().sum(dim, keepdim); +<<<<<<< HEAD at::nansum_out(result, self, dim, keepdim, opt_dtype).div_(factor); +======= + at::native::nansum_out(self, dim, keepdim, opt_dtype, result).div_(factor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } @@ -2251,6 +2267,7 @@ Tensor dist(const Tensor &self, const Tensor& other, const Scalar& p){ return at::norm(self - other, p); } +<<<<<<< HEAD enum class HashMode { XOR_SUM = 0 }; TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode, const Tensor& result) { @@ 
-2269,6 +2286,8 @@ TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keep } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool cpu_equal(const Tensor& self, const Tensor& other) { if (!at::namedinference::are_names_equal( self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) { diff --git a/aten/src/ATen/native/ReduceOps.h b/aten/src/ATen/native/ReduceOps.h index c562bf548403b..818a69b597693 100644 --- a/aten/src/ATen/native/ReduceOps.h +++ b/aten/src/ATen/native/ReduceOps.h @@ -27,7 +27,10 @@ DECLARE_DISPATCH(reduce_fn, min_values_stub) DECLARE_DISPATCH(reduce_fn, max_values_stub) DECLARE_DISPATCH(reduce_fn, argmax_stub) DECLARE_DISPATCH(reduce_fn, argmin_stub) +<<<<<<< HEAD DECLARE_DISPATCH(reduce_fn, xor_sum_stub) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using reduce_std_var_function = void (*)(TensorIterator&, double correction, bool take_sqrt); diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 0c66c7a632997..18e57899bfd9d 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -229,20 +229,29 @@ void replication_pad3d_backward_out_cpu_template( int pbottom = paddingSize[3]; int pfront = paddingSize[4]; int pback = paddingSize[5]; +<<<<<<< HEAD int dimc = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int dimw = 3; int dimh = 2; int dimd = 1; if (input.dim() == 5) { +<<<<<<< HEAD dimc++; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dimw++; dimh++; dimd++; } /* sizes */ +<<<<<<< HEAD int64_t ichannel = input.size(dimc); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t idepth = input.size(dimd); int64_t iheight = input.size(dimh); int64_t iwidth = input.size(dimw); @@ -252,9 +261,12 @@ void replication_pad3d_backward_out_cpu_template( at::native::padding::check_valid_input<3>(input, paddingSize); +<<<<<<< HEAD TORCH_CHECK(ichannel == gradOutput.size(dimc), "gradOutput width unexpected. Expected: ", ichannel, ", Got: ", gradOutput.size(dimc)); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index a946def225b0c..7bb7b30193465 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -107,6 +107,14 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { storage->set_nbytes(size_bytes); } +<<<<<<< HEAD +======= +// Call the sparse implementation in SparseTensor.cpp directly. +// A dynamic dispatch here is NOT necessary, so I didn't put +// this function in native_functions.yaml +const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO(VitalyFedyunin): Move it to HTML docs. 
// // Strides of the output tensor of `resize_as_` operator is defined by input diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 39e203f632781..0b4dd2d51e6d8 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -15,11 +15,15 @@ namespace at::native { Scalar item(const Tensor& self) { auto numel = self.sym_numel(); +<<<<<<< HEAD TORCH_SYM_CHECK( numel.sym_eq(1), "a Tensor with ", numel, " elements cannot be converted to Scalar"); +======= + TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 2b61bcec6a828..13e34c9c25f75 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -480,7 +480,11 @@ REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel) // Currently some computation is being duplicated across forward and backward. +<<<<<<< HEAD // TODO: Cache indices in forward pass to reuse in backward +======= +// TODO: Cache indices in forward pass to re-use in backward +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor _segment_reduce_backward_kernel( const Tensor& grad, const Tensor& output, diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 15794040bf39c..b64e6d64604ed 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -346,17 +346,29 @@ template struct AbsSwitch {}; template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(data); } template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(std::abs(data)); } template +<<<<<<< HEAD inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch /*unused*/) { +======= +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return static_cast(std::abs(at::opmath_type>(data))); } diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 44215a26018f0..2f7b649c26795 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -59,8 +59,11 @@ TORCH_META_FUNC(topk) "selected index k out of range"); int64_t sliceSize = self.dim() == 0 ? 
1 : self.size(dim); TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), " topk does not support complex dtypes on CPU"); TORCH_CHECK(!(self.scalar_type() == kBool), "topk does not support bool dtypes on CPU"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Build the output size, which is the dim being selected set to // size k @@ -76,7 +79,15 @@ TORCH_META_FUNC2(sort, stable) (const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); +<<<<<<< HEAD TORCH_CHECK(!self.is_complex(), " Sort does not support complex dtypes on CPU"); +======= + const auto self_dtype = self.dtype(); + TORCH_CHECK_VALUE( + self_dtype != ScalarType::ComplexFloat && + self_dtype != ScalarType::ComplexDouble, + "Sort currently does not support complex dtypes on CPU."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See issue: https://github.com/pytorch/pytorch/issues/65863 // Strides should be dense, so as not to allocate too much memory. diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index d04813e60281c..d17a67bd4e16c 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -21,7 +21,11 @@ enum class fft_norm_mode { // NOTE [ Fourier Transform Conjugate Symmetry ] // // Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +<<<<<<< HEAD // assuming X is the transformed K-dimensional signal, we have +======= +// assuming X is the transformed K-dimensionsal signal, we have +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // X[i_1, ..., i_K] = X[j_i, ..., j_K]*, // diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 451869f521df2..9ebd64ad67646 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -145,6 +145,15 @@ #include #include +<<<<<<< HEAD +======= +namespace at::native { + +AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); + +} // namespace at::native + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { TORCH_META_FUNC(gather) @@ -469,7 +478,11 @@ static void build_index_op( TensorIteratorBase& iter, const at::native::AdvancedIndex& info, const Tensor& result) { +<<<<<<< HEAD // 'TensorIterator' needs to own the things coming from 'info', since +======= + // 'TensorIterator' needs to own the things comming from 'info', since +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 'info' will be destroyed after the META function. TensorIteratorConfig config; // info.src is a restrided view of result @@ -1906,9 +1919,17 @@ Tensor& index_fill_( "This also applies to advanced indexing e.g. 
tensor[mask] = scalar"); } +<<<<<<< HEAD TORCH_CHECK( self.is_complex() || !source.isComplex(), "index_fill_(): Converting complex Scalar to non-complex type is not supported"); +======= + if (!self.is_complex() && source.isComplex()) { + TORCH_CHECK( + false, + "index_fill_(): Converting complex Scalar to non-complex type is not supported"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Handle the case when `self` is 0-dim Tensor self_nonzero_dim = (self.dim() == 0) ? self.unsqueeze(-1) : self; @@ -2145,6 +2166,7 @@ static void _scatter_via_index_put( const Tensor& src, const Tensor& mut_out, bool accumulate) { +<<<<<<< HEAD // If index is expanded with zero strides across non-scatter dimensions, // advanced indexing with the index tensor alone achieves the desired // semantics and avoids creating large intermediate tensors. @@ -2192,6 +2214,83 @@ static void _scatter_via_index_put( } } mut_out.index_put_(indices, src_view, accumulate); +======= + if (self.dim() == 1) { + torch::List> indices; + indices.reserve(1); + indices.push_back(index); + mut_out.index_put_(indices, src, accumulate); + } else { + Tensor mut_out_contig = mut_out.contiguous(); + + auto index_coords_sizes = index.sizes().vec(); + index_coords_sizes.push_back(self.dim()); + auto index_coords = at::empty( + index_coords_sizes, + at::TensorOptions().dtype(at::ScalarType::Long).device(self.device())); + + for (int64_t dim_other = 0; dim_other < self.dim(); dim_other++) { + if (dim_other == dim) { + continue; + } + auto dim_coord_vals = at::arange( + index.size(dim_other), at::TensorOptions().device(self.device())); + + for (int64_t dim_unsqueeze = 0; dim_unsqueeze < self.dim() - 1; + dim_unsqueeze++) { + dim_coord_vals = + dim_coord_vals.unsqueeze((dim_unsqueeze >= dim_other) ? -1 : 0); + } + + auto view_sizes = index.sizes().vec(); + view_sizes.push_back(1); + auto view_strides = index_coords.strides().vec(); + view_strides[self.dim()] = self.dim(); + + at::as_strided(index_coords, view_sizes, view_strides, dim_other) + .copy_(dim_coord_vals.unsqueeze(-1)); + } + + auto view_sizes = index.sizes().vec(); + view_sizes.push_back(1); + auto view_strides = index_coords.strides().vec(); + view_strides[self.dim()] = self.dim(); + + at::as_strided(index_coords, view_sizes, view_strides, dim) + .copy_(index.unsqueeze(-1)); + + Tensor index_coords_flat = index_coords.flatten(0, -2); + + // Copy mut_out_contig's strides into a tensor + // TODO: Is there a utility function that already does this? 
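  // The flat offset of a coordinate tuple (c_0, ..., c_{d-1}) into a
  // contiguous tensor is sum_k c_k * stride_k; e.g. a contiguous 2x3 tensor
  // has strides {3, 1}, so coordinate (1, 2) maps to 1*3 + 2*1 = 5. The
  // strides are materialized as a tensor so that this dot product can be
  // evaluated for all gathered coordinates at once further below.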
+ IntArrayRef mut_out_contig_strides = mut_out_contig.strides(); + Tensor coord_strides = at::empty( + {mut_out_contig.dim()}, + TensorOptions().dtype(at::ScalarType::Long).device(at::kCPU)); + std::memcpy( + coord_strides.mutable_data_ptr(), + mut_out_contig_strides.data(), + coord_strides.nbytes()); + coord_strides = coord_strides.to(mut_out_contig.device()); + + // `index_flat` contains the 1-D indices corresponding with the + // flattened `mut_out` + Tensor index_flat = (index_coords_flat * coord_strides).sum({-1}); + Tensor mut_out_flat = mut_out_contig.flatten(); + Tensor src_flat = + at::as_strided(src, index.sizes(), src.strides()).flatten(); + + torch::List> indices; + indices.reserve(1); + indices.push_back(index_flat); + + mut_out_flat.index_put_(indices, src_flat, accumulate); + + if (!mut_out.is_contiguous()) { + mut_out.copy_(mut_out_flat.reshape(mut_out.sizes())); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -2674,7 +2773,11 @@ inline std::tuple _take_along_dim_helper( std::move(dim)); } +<<<<<<< HEAD inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +======= +static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( !t.defined() || t.device() == device, "Expected tensor to have ", @@ -2687,7 +2790,11 @@ inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { ")"); } +<<<<<<< HEAD inline void checkDevice( +======= +static inline void checkDevice( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CheckedFrom c, at::ArrayRef tensors, Device device) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index 6f127b711d3e8..f6d6ed40acb22 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -35,9 +35,13 @@ inline std::tuple canDispatchToMaskedFill( auto self_device = self.device(); for (const std::optional& i : indices) { if (!i.has_value() || !(*i).defined()) { +<<<<<<< HEAD if (!mask.defined()) { num_ind++; } +======= + num_ind++; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { const Tensor& index = *i; if ((index.scalar_type() != kByte && index.scalar_type() != kBool) || @@ -73,11 +77,19 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { checkIndexTensorTypes(orig, /*allow_int*/ true); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more // LongTensors +<<<<<<< HEAD auto indices = expandTensors(self, orig, /*ensure_same_device=*/true); // next broadcast all index tensors together try { indices = expand_outplace(indices); } catch (std::exception&) { +======= + auto indices = expandTensors(self, orig); + // next broadcast all index tensors together + try { + indices = expand_outplace(indices); + } catch (std::exception& e) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK_INDEX( false, "shape mismatch: indexing tensors could not be broadcast together" @@ -93,6 +105,15 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { if 
(!hasContiguousSubspace(indices)) { std::tie(self, indices) = transposeToFront(self, indices); } +<<<<<<< HEAD +======= + // Ensure indices are on the same device as self + for (auto& indice : indices) { + if (indice.defined() && indice.device() != self.device()) { + indice = indice.to(self.device()); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (auto& indice : indices) { if (indice.defined() && indice.dtype() == at::kInt) { indice = indice.to(at::kLong); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c6126eda61e73..1df1d4b40924b 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -73,6 +73,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #endif @@ -846,7 +850,11 @@ TORCH_IMPL_FUNC(clamp_Tensor_out) (const Tensor& self, const OptionalTensorRef min, const OptionalTensorRef max, +<<<<<<< HEAD const Tensor& /*unused*/) { +======= + const Tensor&) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (min && max) { clamp_stub(device_type(), *this); } else if (min) { diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 7df7745fc5077..ef8065a3ff90b 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -67,7 +67,11 @@ namespace at::native { namespace { // dense_to_sparse_{csr,bsr,csc,bsc} common helpers +<<<<<<< HEAD // Preparation for the N-D dense -> sparse compressed conversion. +======= +// Preparation fo the N-D dense -> sparse compressed conversion. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // The N-D input is converted to 3-D (single batch dim) where we check that the // product of batch dims is nonzero and for each batch the sparse matrix // contained within has the same number of non-zero elements. 
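A recurring difference between the two sides of this merge, visible in many of the hunks above and below, is whether translation-unit-local helpers are marked `static` or simply left inside an unnamed namespace. Inside an unnamed namespace the two spellings are equivalent, because everything there already has internal linkage; at plain namespace scope, however, `static` does change the linkage, so dropping it from an `inline` helper is not a pure style change. A minimal sketch of the distinction (illustrative names, not code from this patch):

// linkage_sketch.cpp
namespace {
// Already has internal linkage; an extra `static` here would be redundant.
void helper_in_unnamed_namespace() {}
}  // namespace

// Internal linkage: visible only in this translation unit.
static void helper_static() {}

// External linkage; `inline` permits identical definitions in other TUs.
inline void helper_inline() {}

int main() {
  helper_in_unnamed_namespace();
  helper_static();
  helper_inline();
  return 0;
}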
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1886e65fc1edc..20349ae43ea90 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1367,9 +1367,15 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) { for (int64_t i = 0; i < n - 1; i++) { // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) int64_t z = generator->random() % (n - i); +<<<<<<< HEAD scalar_t save = r__data[i * r__stride_0]; r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0]; r__data[(z + i) * r__stride_0] = save; +======= + scalar_t sav = r__data[i * r__stride_0]; + r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0]; + r__data[(z + i) * r__stride_0] = sav; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return; } @@ -1640,9 +1646,12 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { +<<<<<<< HEAD for (const auto& dim_size : size) { TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index ce2987eb251ae..a32782eec7763 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -80,7 +80,11 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { } /// Chooses a dimension over which to parallelize. Prefers the outer-most +<<<<<<< HEAD /// dimension that's larger than the number of available threads. +======= +/// dimension thats larger than the number of available threads. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static int find_split_dim(TensorIteratorBase& iter) { int num_threads = at::get_num_threads(); auto shape = iter.shape(); diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index c15b082f107b2..f25335234e53c 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,7 +18,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -58,12 +61,15 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +<<<<<<< HEAD c10::SymBool sym_is_contiguous( const Tensor& self, c10::MemoryFormat memory_format) { return self.sym_is_contiguous(memory_format); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } @@ -91,6 +97,12 @@ bool cudnn_is_acceptable(const TensorBase& self) { return false; if (!self.is_cuda()) return false; +<<<<<<< HEAD +======= + auto st = self.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty @@ -117,7 +129,11 @@ Tensor& detach_(Tensor& self) { } Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { +<<<<<<< HEAD if (self.is_contiguous_or_false(memory_format)) { +======= + if (self.is_contiguous(memory_format)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } TORCH_CHECK( diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6df7761d822db..794a69451f6ac 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,5 +1,8 @@ +<<<<<<< HEAD #include #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -249,7 +252,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // Checking names before the actual dimensions. auto maybe_outnames = namedinference::compute_cat_outnames(materialized); +<<<<<<< HEAD TORCH_CHECK_VALUE( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !materialized.empty(), "torch.cat(): expected a non-empty list of Tensors"); @@ -276,7 +283,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // when computing the actual output dtype and the flags. if (is_out_defined) { // Check for type promotion, if the output tensor is defined. 
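  // For example, concatenating float inputs into an integer `out` tensor is
  // rejected by the check below: canCast(kFloat, kLong) is false because the
  // implicit cast would be lossy.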
+<<<<<<< HEAD TORCH_CHECK_TYPE( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) canCast(out_dtype, result.scalar_type()), "torch.cat(): input types can't be cast to the desired output type ", result.scalar_type()); @@ -295,7 +306,11 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) { // are compatible, i.e. we can execute `cat` on them. bool found_valid_tensor = valid < materialized.size(); if (found_valid_tensor) { +<<<<<<< HEAD TORCH_CHECK_INDEX( +======= + TORCH_CHECK( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dim <= materialized[valid].get().dim(), "torch.cat(): dimension ", dim, @@ -386,7 +401,11 @@ Tensor& set_storage_cpu_( result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? at::OptionalIntArrayRef(stride) : std::nullopt; +<<<<<<< HEAD // We can reuse this kernel for the meta device. +======= + // We can re-use this kernel for the meta device. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We just need to make sure we don't actually try to resize the (null) // storage. at::native::resize_impl_cpu_( @@ -461,7 +480,12 @@ Tensor& set_storage_meta__symint( size, stride, itemsize, std::move(storage_offset)); if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && +<<<<<<< HEAD (new_size_bytes > storage.sym_nbytes())) { +======= + TORCH_GUARD_SIZE_OBLIVIOUS( + new_size_bytes.sym_gt(storage.sym_nbytes()))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) storage.set_nbytes(std::move(new_size_bytes)); } } @@ -507,7 +531,11 @@ Tensor& set_cpu_(Tensor& result) { return result; } +<<<<<<< HEAD // We can't reuse the cpu kernel here because we don't want to use the cpu +======= +// We can't re-use the cpu kernel here because we don't want to use the cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // allocator. Tensor& set_meta_(Tensor& result) { caffe2::TypeMeta dtype = result.dtype(); @@ -1410,6 +1438,12 @@ Tensor as_strided_tensorimpl( IntArrayRef size, IntArrayRef stride, std::optional storage_offset_) { +<<<<<<< HEAD +======= + TORCH_INTERNAL_ASSERT( + !self.is_mps(), + "as_strided_tensorimpl does not work with MPS; call self.as_strided(...) 
instead"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, @@ -1880,18 +1914,33 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor xtensor = self.expand(padded_size); +<<<<<<< HEAD Tensor urtensor; if (self.is_quantized()) { urtensor = at::empty_quantized(target_size, self); } else { urtensor = at::empty(target_size, self.options()); +======= + Tensor result; + if (self.is_quantized()) { + result = at::empty_quantized(target_size, self); + } else { + result = at::empty(target_size, self.options()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // return an empty tensor if one of the repeat dimensions is zero if (zero_tensor) { +<<<<<<< HEAD return urtensor; } +======= + return result; + } + + Tensor urtensor = at::alias(result); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(xtensor.dim())) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). @@ -1901,6 +1950,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { urtensor.copy_(xtensor.expand_as(urtensor)); +<<<<<<< HEAD // Combine the dimensions to produce the target_size. // xtensor dims: [a0, ..., ad-1] // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1] @@ -1921,6 +1971,13 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { // If self.size() > len(reps), reps is promoted to self.size() by prepending +======= + return result; +} + +Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { + // If self.size() > len(reps), reps is promoted to self.size() by pre-pending +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1’s to it to keep the same behaviour as `numpy.tile`. // Thus for a tensor of shape (2, 3, 4, 5), a dims of (2, 2) is treated // as (1, 1, 2, 2). @@ -2011,18 +2068,32 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) { TORCH_CHECK(false, "reshape is not implemented for sparse tensors"); } +<<<<<<< HEAD if (self.is_contiguous_or_false() && !self.is_mkldnn()) { return self.view_symint(proposed_shape); } auto sym_numel = self.sym_numel(); +======= + auto sym_sizes = self.sym_sizes(); + auto sym_strides = self.sym_strides(); + auto sym_numel = self.sym_numel(); + if (definitely_contiguous(sym_sizes, sym_strides, sym_numel) && + !self.is_mkldnn()) { + return self.view_symint(proposed_shape); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::SymDimVector shape = infer_size_dv(proposed_shape, sym_numel); if (self.is_mkldnn()) { return at::_mkldnn_reshape(self, C10_AS_INTARRAYREF_SLOW(shape)); } +<<<<<<< HEAD auto sym_sizes = self.sym_sizes(); auto sym_strides = self.sym_strides(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // `computeStride` returns the proper strides to use if this // `reshape` can be just a view. 
@@ -2067,7 +2138,11 @@ Tensor _reshape_copy_symint( TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors"); } +<<<<<<< HEAD if (self.is_contiguous_or_false()) { +======= + if (self.is_contiguous()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.view_symint(shape).clone(at::MemoryFormat::Contiguous); } else { return at::_unsafe_view_symint( @@ -2444,7 +2519,11 @@ Tensor index_select_sparse_cpu( const auto dim_indices = indices[dim].contiguous(); // If nnz is smaller than size, then either indices[dim] or index gets +<<<<<<< HEAD // sorted, then this is followed by a binary search to find intersections. +======= + // sorted, then this is followed by a binary search to find interesections. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto get_selected_indices_small_nnz_large_size = [&]() -> std::tuple { const auto grain_size = at::internal::GRAIN_SIZE; @@ -3641,7 +3720,11 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { namespace { // Transpose implementation for sparse compressed layouts // NB: We assume that dim1,dim0 have already been wrapped +<<<<<<< HEAD inline Tensor sparse_compressed_transpose( +======= +static inline Tensor sparse_compressed_transpose( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim0, int64_t dim1) { @@ -3950,7 +4033,11 @@ Tensor squeeze_qtensor(const Tensor& self, c10::OptionalIntArrayRef dims) { quantizer->scalar_type()); } // TODO: quantized Tensor support for SymInt needs to be added but basic +<<<<<<< HEAD // building blocks are missing for now. +======= + // building blocs are missing for now. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto result = make_qtensor( self, C10_AS_INTARRAYREF_SLOW(sizes), diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 08b666e296ed7..fb5a4f1aca8a1 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -52,7 +52,10 @@ void apply_triu_tril_single( int64_t self_col_stride, bool upper) { constexpr int64_t zero = 0; +<<<<<<< HEAD k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (upper) { parallel_for(0, n, 0, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 156d2c8974b84..aeb63441d2422 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -29,7 +29,11 @@ namespace { // grad_in does not mean that it is a gradient wrt to input, // grad_in/grad_out is just an input/output of unfold_backward kernel. 
+<<<<<<< HEAD [[maybe_unused]] TensorIterator _make_unfold_backward_iter_over_grad_out( +======= +[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index b14079e7ea19c..98fe558953c4d 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -124,7 +124,11 @@ struct IsUnique {}; template struct IsUnique { +<<<<<<< HEAD bool operator() (scalar_t* data_ptr, int64_t i) { +======= + inline bool operator() (scalar_t* data_ptr, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == 0) { return true; } return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]); } @@ -132,7 +136,11 @@ struct IsUnique { template struct IsUnique { +<<<<<<< HEAD bool operator() (scalar_t* data_ptr, int64_t i) { +======= + inline bool operator() (scalar_t* data_ptr, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (i == 0) { return true; } return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1])) && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1])); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index cf6727c2207c7..901d8dd72fb05 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,6 +4,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -406,7 +410,11 @@ scalar_t cubic_convolution2(scalar_t x, scalar_t A) { } template +<<<<<<< HEAD static inline void get_cubic_upsample_coefficients( +======= +void get_cubic_upsample_coefficients( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t coeffs[4], scalar_t t) { scalar_t A = -0.75; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 3ab8795f6dca3..2df198a37e668 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -105,7 +105,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void upsample_bicubic2d_backward_out_frame( +======= +static void upsample_bicubic2d_backward_out_frame( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* odata, scalar_t* idata, int64_t input_height, @@ -177,7 +181,11 @@ void upsample_bicubic2d_backward_out_frame( }); } +<<<<<<< HEAD void upsample_bicubic2d_backward_kernel( +======= +static void upsample_bicubic2d_backward_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 02c798a3d0400..9f2c8f7a5c8d9 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -25,11 +25,19 @@ namespace at::native { void 
_backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, std::optional keep_graph, bool create_graph) { +<<<<<<< HEAD self._backward(inputs, gradient_opt, keep_graph, create_graph); } void set_data(Tensor& self, const Tensor& new_data) { self.set_data(new_data); +======= + return self._backward(inputs, gradient_opt, keep_graph, create_graph); +} + +void set_data(Tensor& self, const Tensor& new_data) { + return self.set_data(new_data); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor data(const Tensor& self) { @@ -54,7 +62,11 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { } void retain_grad(Tensor& self) { +<<<<<<< HEAD self.retain_grad(); +======= + return self.retain_grad(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool retains_grad(const Tensor& self) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index 0773217c90a4c..159ac68f45169 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -39,6 +39,10 @@ int register_linear_params() { } namespace { +<<<<<<< HEAD [[maybe_unused]] auto linear_params = register_linear_params(); +======= +[[maybe_unused]] static auto linear_params = register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace } // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h index 9a122cd7cf05e..6730187ebd385 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h @@ -4,11 +4,17 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include #include #include C10_DIAGNOSTIC_POP() +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace ao::sparse { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp index 01b292adc01c3..11f2f9ed91551 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp @@ -17,7 +17,11 @@ namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM @@ -128,7 +132,11 @@ at::Tensor PackedLinearWeight::apply_impl( auto* input_tr_ptr = reinterpret_cast(input_tr.data_ptr()); // TODO: Activation transpose before and after the kernel can be removed if we +<<<<<<< HEAD // keep activation tensor always transposed. +======= + // keep activation tensor always tranposed. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fbgemm::transpose_simd( batch_size, K, input_ptr, K, input_tr_ptr, batch_size); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 968e58d591c1d..91abc68c2484d 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -20,7 +20,11 @@ namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM namespace { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index b9cffe5b0bcbf..f7e2a0cdf3503 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -16,7 +16,11 @@ #endif namespace ao::sparse { +<<<<<<< HEAD +======= +int register_linear_params(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index bc9b452bc6876..86db66a0b7935 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -26,11 +26,15 @@ namespace at::native { namespace { +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) // Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON __attribute__((optimize("no-tree-vectorize"))) #endif void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +======= +static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { using Vec = Vectorized; @@ -96,7 +100,11 @@ void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const Tensor } } +<<<<<<< HEAD void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +======= +static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; @@ -150,7 +158,11 @@ void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { } } +<<<<<<< HEAD void threshold_kernel( +======= +static void threshold_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorIteratorBase& iter, const Scalar& threshold_scalar, const Scalar& value_scalar) { @@ -868,7 +880,11 @@ void hardswish_backward_kernel(TensorIterator& iter) { } } +<<<<<<< HEAD void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +======= +static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& 
negval_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); @@ -907,7 +923,11 @@ void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { } } +<<<<<<< HEAD void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +======= +static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index 3a19088114b2d..434d084d9cf2f 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -520,7 +520,11 @@ cpu_adaptive_avg_pool3d_channels_last( scalar_t* out = output_data + i * channels; int64_t size = channels; +<<<<<<< HEAD // Note: For ordinary usage scenario, each out lane should +======= + // Note: For oridinary usage scenario, each out lane should +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // fit in L1 cache; otherwise consider block dim C. // Pass I: zero the out lane int64_t d1 = 0; diff --git a/aten/src/ATen/native/cpu/AtomicAddFloat.h b/aten/src/ATen/native/cpu/AtomicAddFloat.h index 526f86d705b77..db0bf3c40d151 100644 --- a/aten/src/ATen/native/cpu/AtomicAddFloat.h +++ b/aten/src/ATen/native/cpu/AtomicAddFloat.h @@ -22,7 +22,11 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue) old_value.floatV = *dst; new_value.floatV = old_value.floatV + fvalue; +<<<<<<< HEAD unsigned* old_intV = &old_value.intV; +======= + unsigned* old_intV = (unsigned*)(&old_value.intV); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { #ifdef __aarch64__ __asm__ __volatile__("yield;" : : : "memory"); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 10e0daacab33c..5725154954130 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -300,8 +300,12 @@ void div_floor_kernel(TensorIteratorBase& iter) { // In the special case of unsigned integer division, floor division is // equivalent to truncation division (since the signs of the divisor and // dividend are always the same) +<<<<<<< HEAD div_trunc_kernel(iter); return; +======= + return div_trunc_kernel(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if (isIntegralType(dtype, /*includeBool*/ false)) { // There's no SIMD integer division, so don't try to vectorize it. 
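    // For signed integers, floor division and truncation differ whenever the
    // operands have opposite signs and the remainder is nonzero: -7 / 2
    // truncates to -3 but floors to -4. The scalar form of the correction is
    //   q = a / b; r = a % b;
    //   if (r != 0 && ((r < 0) != (b < 0))) q -= 1;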
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index 2e3a82ac049e7..c7a1e2b390f2a 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -118,7 +118,11 @@ gemm_notrans_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b) +<<<<<<< HEAD const uint64_t unsigned_m = m; +======= + const uint64_t unsigned_m = static_cast(m); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const uint64_t i_m = unsigned_m / 4; for (const uint64_t l : c10::irange(k)) { for (const uint64_t j : c10::irange(n)) { @@ -369,7 +373,11 @@ void gemm_notrans_( #endif // defined(__aarch64__) && !defined(C10_MOBILE) #if !defined(C10_MOBILE) +<<<<<<< HEAD float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +======= +static float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native::CPU_CAPABILITY::fp16_dot_with_fp32_arith( a, b, len); } @@ -406,7 +414,11 @@ void gemm_transa_( }); } +<<<<<<< HEAD float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +======= +static float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native::CPU_CAPABILITY::bf16_dot_with_fp32_arith(a, b, len); } diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 365a79ba52ca9..06768175b39c7 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -15,12 +15,20 @@ namespace at::native { inline namespace CPU_CAPABILITY { namespace { +<<<<<<< HEAD bool reduced_input(ScalarType input_t, ScalarType output_t) { +======= +static bool reduced_input(ScalarType input_t, ScalarType output_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return !at::isFloat8Type(input_t) && at::isReducedFloatingType(input_t) && output_t == kFloat; } +<<<<<<< HEAD bool reduced_output(ScalarType input_t, ScalarType output_t) { +======= +static bool reduced_output(ScalarType input_t, ScalarType output_t) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return !at::isFloat8Type(output_t) && at::isReducedFloatingType(output_t) && input_t == kFloat; } diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 66e49f911f68b..0700cd2669f48 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -15,7 +15,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +======= +static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t total = a.numel() / 3; int64_t a_stride = a.stride(dim); int64_t b_stride = b.stride(dim); @@ -68,7 +72,11 @@ void apply_cross(const Tensor& result, const Tensor& a, 
const Tensor& b, const i }); } +<<<<<<< HEAD void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +======= +static void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, result.scalar_type(), "cross", [&]() { apply_cross(result, a, b, dim); }); diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index cce4c43b2e4e5..ffb0bd3ddcea7 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -259,6 +259,7 @@ inline void winograd_f2k3_input_transform_inplace__rvv( const vfloat32m1_t wd1 = __riscv_vfadd_vv_f32m1(d1, d2, 4); const vfloat32m1_t wd2 = __riscv_vfsub_vv_f32m1(d2, d1, 4); const vfloat32m1_t wd3 = __riscv_vfsub_vv_f32m1(d1, d3, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround: * Avoid single-statement read-modify-write on MEM_REF like: * *input_tile_val = @@ -273,6 +274,13 @@ inline void winograd_f2k3_input_transform_inplace__rvv( tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 2, wd2); tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 3, wd3); *input_tile_val = tmp_input_tile_val; +======= + + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wd0); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wd1); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 2, wd2); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 3, wd3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void winograd_f2k3_output_transform_inplace__rvv( @@ -286,6 +294,7 @@ inline void winograd_f2k3_output_transform_inplace__rvv( const vfloat32m1_t wm0 = __riscv_vfadd_vv_f32m1(m0_plus_m1, m2, 4); const vfloat32m1_t m1_sub_m2 = __riscv_vfsub_vv_f32m1(m1, m2, 4); const vfloat32m1_t wm1 = __riscv_vfsub_vv_f32m1(m1_sub_m2, m3, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. * Keep the temporary + write-back pattern to avoid ICE. * Do NOT rewrite into: @@ -295,6 +304,11 @@ inline void winograd_f2k3_output_transform_inplace__rvv( tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 0, wm0); tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 1, wm1); *input_tile_val = tmp_output_tile_val; +======= + + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wm0); + *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wm1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline vfloat32m1_t @@ -315,6 +329,7 @@ inline void winograd_f2k3_kernel_transform__rvv( const vfloat32m1_t const_half = __riscv_vfmv_v_f_f32m1(0.5f, 4); const vfloat32m1_t g0_plus_g2 = __riscv_vfadd_vv_f32m1(g0, g2, 4); vfloat32m1_t half_g0_plus_g2 = __riscv_vfmul_vv_f32m1(const_half, g0_plus_g2, 4); +<<<<<<< HEAD /* GCC 14.2 (RISC-V RVV) ICE workaround — see note above. * Keep the temporary + write-back pattern to avoid ICE. 
* Do NOT rewrite into: @@ -326,6 +341,13 @@ inline void winograd_f2k3_kernel_transform__rvv( tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 3, g2); *transform = tmp_transform; +======= + + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 0, g0); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1)); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1)); + *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 3, g2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline vfloat32m1x4_t v4f_transpose4x4__rvv(const vfloat32m1x4_t m) { @@ -473,11 +495,19 @@ void convolution_depthwise3x3_winograd_impl( #else void convolution_depthwise3x3_winograd_impl( +<<<<<<< HEAD const Arguments& /*unused*/, const float* const /*unused*/, const float* const /*unused*/, const float* const /*unused*/, float* const /*unused*/) { +======= + const Arguments&, + const float* const, + const float* const, + const float* const, + float* const) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif /* __ARM_NEON__ */ diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index e900bc5216117..6bd4bfee7d78c 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -34,7 +34,11 @@ struct Dist { // finish : This tells what to do with the aggregated value to compute // the norm. Generally this is the result of val ^ (1 / p). // backward : This is the gradient for that norm. Arguments are pretty +<<<<<<< HEAD // self explanatory. +======= + // self explanitory. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // There are a few cases where these aren't used. The 0 norm has no backward, // because it's always 0, so that's shortcircuited earlier. There's a special @@ -139,7 +143,11 @@ struct Dist { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return max(agg, up); } static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } +<<<<<<< HEAD // TODO This backward pass uses a very complex expression to compute (diff +======= + // TODO This backward pass uses a very complext expression to compute (diff +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // == dist) that could be much faster if using SSE instructions. static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff) * (Vec(1) - vec::minimum(Vec(1), (diff.abs() - Vec(dist)).abs().ceil())); } }; @@ -160,9 +168,16 @@ struct Dist { // value of k. 
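    // Concretely: with pairs (i, j), i < j, of n points enumerated row by row,
    // row i is preceded by S(i) = i * n - i * (i + 1) / 2 pairs. Solving
    // S(i) <= k for the largest integer i gives
    //   i = floor((n - 0.5) - sqrt((n - 0.5)^2 - 2 * k)),
    // and then j = k - S(i) + i + 1 = k - n * i + i * (i + 1) / 2 + i + 1,
    // which is what the loop below computes; the extra -1 under the square
    // root only guards against landing on the wrong side of an integer
    // boundary in floating point.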
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) { const Vec pvec(p); +<<<<<<< HEAD double n2 = static_cast(n) - .5; // The -1 accounts for floating point truncation issues int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast(k) - 1.0))); +======= + double n2 = n - .5; + // The -1 accounts for floating point truncation issues + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; const scalar_t * self_i = self_start + i * m; @@ -421,19 +436,31 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double }); } +<<<<<<< HEAD void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +======= +static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_backward", [&] { Dist::apply_backward_pdist(result, grad, self, p, dist); }); } +<<<<<<< HEAD void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +======= +static void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist", [&] { Dist::apply_cdist(result, x1, x2, p); }); } +<<<<<<< HEAD void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +======= +static void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist_backward", [&] { Dist::apply_backward_cdist(result, grad, x1, x2, p, dist); }); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index e3fdefb523044..ead8515216baa 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -27,7 +27,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +======= +static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } @@ -101,7 +105,11 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { +======= +static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } @@ -198,12 +206,20 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { +======= +static void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } +<<<<<<< HEAD void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +======= +static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } @@ -218,12 +234,20 @@ void normal_kernel(const TensorBase &self, double mean, double std, std::optiona templates::cpu::normal_kernel(self, mean, std, generator); } +<<<<<<< HEAD void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +======= +static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, base, generator); } +<<<<<<< HEAD void random_kernel(TensorIteratorBase& iter, std::optional gen) { +======= +static void random_kernel(TensorIteratorBase& iter, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -231,7 +255,11 @@ void random_kernel(TensorIteratorBase& iter, std::optional gen) { // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) +<<<<<<< HEAD void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +======= +static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 1f8693902a32b..63a5f751caaad 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -85,7 +85,11 @@ struct RandomKernel { // ==================================================== Normal ======================================================== 
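A pattern that repeats throughout this diff, including the DistributionKernels.cpp hunks just above, is one side marking kernels in an anonymous namespace as `static` while HEAD drops the keyword. Both spellings give the functions internal linkage, so the keyword is redundant inside an unnamed namespace; a minimal sketch:

    // Illustrative only: names declared inside an unnamed namespace already have
    // internal linkage, so adding `static` changes nothing about their visibility.
    namespace {

    void helper_a() {}         // internal linkage via the unnamed namespace
    static void helper_b() {}  // same linkage; `static` is redundant here

    }  // namespace

    int main() {
      helper_a();
      helper_b();
      return 0;
    }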
#ifdef CPU_CAPABILITY_AVX2 +<<<<<<< HEAD void normal_fill_16_AVX2(float *data, +======= +static void normal_fill_16_AVX2(float *data, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256* two_pi, const __m256* one, const __m256* minus_two, @@ -136,7 +140,11 @@ void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, #endif template +<<<<<<< HEAD void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +======= +static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto j : c10::irange(8)) { const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. const scalar_t u2 = data[j + 8]; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index 5ac4971396076..4b4a2c8c256fc 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -96,6 +96,7 @@ inline void _exp_reduce_sum_fusion_kernel( for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { auto tmp0 = vec::Vectorized::loadu(a + i); auto tmp1 = tmp0 - vec_max; +<<<<<<< HEAD Vectorized tmp2; if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) @@ -104,6 +105,9 @@ inline void _exp_reduce_sum_fusion_kernel( } else { tmp2 = tmp1.exp_u20(); } +======= + auto tmp2 = tmp1.exp_u20(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vec_tmp_sum += tmp2; _store(out + i, tmp2); } @@ -158,14 +162,22 @@ inline void _mul_reduce_max_fusion_kernel( } template +<<<<<<< HEAD inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +======= +static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(ptr2 == nullptr); return ptr; } template , int> = 0> +<<<<<<< HEAD inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +======= +static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ptr2; } @@ -315,12 +327,21 @@ void cpu_flash_attention( bool is_causal, std::optional attn_mask, std::optional scale) { +<<<<<<< HEAD // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x KV_num_heads x KV_seq_len x Dim_per_head) // -> (Batch x KV_seq_len x KV_num_heads x Dim_per_head) // Value (Batch x KV_num_heads x KV_seq_len x Dim_per_head) // -> (Batch x KV_seq_len x KV_num_heads x Dim_per_head) +======= + // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) + // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) + // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) + // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) + // Value (Batch x Num_heads x KV_seq_len x Dim_per_head) + // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::Tensor query = q.transpose(1, 2); at::Tensor key = k.transpose(1, 2); at::Tensor 
value = v.transpose(1, 2); @@ -338,8 +359,11 @@ void cpu_flash_attention( int64_t qSize = query.size(1); int64_t kvSize = value.size(1); int64_t num_head = query.size(2); +<<<<<<< HEAD int64_t kv_num_head = key.size(2); int64_t repeat_factor = num_head / kv_num_head; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t headSize = query.size(3); bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel(); @@ -400,7 +424,11 @@ void cpu_flash_attention( // When the number of gemm is greater than the number of pack, // the pack overhead can be overlapped. if (need_pack) { +<<<<<<< HEAD double pack_size = batchSize * kv_num_head * kvSize * headSize; +======= + double pack_size = batchSize * num_head * kvSize * headSize; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) double qs_per_thread = (batchSize * num_head * qSlice + num_thread - 1) / num_thread; double gemm_size_per_thread = qs_per_thread * qSplitSize * (is_causal ? std::min(qSize, kvSize) : kvSize) * headSize; @@ -450,10 +478,17 @@ void cpu_flash_attention( at::Tensor qeury_t_padding; if (need_pack) { key_t_reorder = at::empty( +<<<<<<< HEAD {batchSize, kv_num_head, eheadSize, kvSize}, c10::CppTypeToScalarType::value); value_t_reorder = at::empty( {batchSize, kv_num_head, kv_padding_size, headSize}, +======= + {batchSize, num_head, eheadSize, kvSize}, + c10::CppTypeToScalarType::value); + value_t_reorder = at::empty( + {batchSize, num_head, kv_padding_size, headSize}, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::CppTypeToScalarType::value); key_reorder_ptr = key_t_reorder.data_ptr(); value_reorder_ptr = value_t_reorder.data_ptr(); @@ -472,11 +507,19 @@ void cpu_flash_attention( {num_thread, kvSplitSize, headSize}, c10::CppTypeToScalarType::value); scalar_t* transpose_buffer_ptr = tranpose_t_reorder.data_ptr(); +<<<<<<< HEAD at::parallel_for(0, batchSize * kv_num_head * kvSlice, 1, [&](int64_t begin, int64_t end) { int ompIdx = at::get_thread_num(); int64_t i = 0, kv_j = 0, l = 0, n = 0; scalar_t* transpose_ptr = transpose_buffer_ptr + ompIdx * kvSplitSize * headSize; at::native::data_index_init(begin, i, batchSize, kv_j, kv_num_head, l, kvSlice); +======= + at::parallel_for(0, batchSize * num_head * kvSlice, 1, [&](int64_t begin, int64_t end) { + int ompIdx = at::get_thread_num(); + int64_t i = 0, j = 0, l = 0, n = 0; + scalar_t* transpose_ptr = transpose_buffer_ptr + ompIdx * kvSplitSize * headSize; + at::native::data_index_init(begin, i, batchSize, j, num_head, l, kvSlice); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for ([[maybe_unused]] auto z : c10::irange(begin, end)) { n = l * kvSplitSize; int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); @@ -486,7 +529,11 @@ void cpu_flash_attention( kvBlockSize, headSize, /* src_ptr */ +<<<<<<< HEAD reinterpret_cast(k_data + i * kStrideB + kv_j * kStrideH + n * kStrideN), +======= + reinterpret_cast(k_data + i * kStrideB + j * kStrideH + n * kStrideN), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ kStrideN, /* dst */ reinterpret_cast(transpose_ptr), /* ld_dst */ kvBlockSize); @@ -494,24 +541,40 @@ void cpu_flash_attention( // Pack [headSize, 
kvBlockSize] at::vec::pack_vnni2( /* src */ reinterpret_cast(transpose_ptr), +<<<<<<< HEAD /* dst */ reinterpret_cast(key_reorder_ptr + i * kv_num_head * eheadSize * kvSize + kv_j * eheadSize * kvSize + n * eheadSize), +======= + /* dst */ reinterpret_cast(key_reorder_ptr + i * num_head * eheadSize * kvSize + + j * eheadSize * kvSize + n * eheadSize), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ kvBlockSize, /* K */ headSize, /* N */ kvBlockSize); // Pack [kvBlockSize, headSize] at::vec::pack_vnni2( +<<<<<<< HEAD /* src */ reinterpret_cast(v_data + i * vStrideB + kv_j * vStrideH + n * vStrideN), /* dst */ reinterpret_cast(value_reorder_ptr + i * kv_num_head * kv_padding_size * headSize + kv_j * kv_padding_size * headSize + n * headSize), +======= + /* src */ reinterpret_cast(v_data + i * vStrideB + j * vStrideH + n * vStrideN), + /* dst */ reinterpret_cast(value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + n * headSize), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* ld_src */ vStrideN, /* K */ kvBlockSize, /* N */ headSize); // Move to the next query +<<<<<<< HEAD at::native::data_index_step(i, batchSize, kv_j, kv_num_head, l, kvSlice); +======= + at::native::data_index_step(i, batchSize, j, num_head, l, kvSlice); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); } @@ -533,7 +596,10 @@ void cpu_flash_attention( for ([[maybe_unused]] auto z : c10::irange(begin, end)) { int64_t m = k * qSplitSize; int64_t qBlockSize = std::min(qSplitSize, qSize - m); +<<<<<<< HEAD int64_t kv_j = j / repeat_factor; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Initialize max and sum fill_stub(qk_max_data, -std::numeric_limits::infinity(), qBlockSize); @@ -570,8 +636,13 @@ void cpu_flash_attention( !headSize_even ? 
query_t_padding_ptr : q_data + i * qStrideB + j * qStrideH + m * qStrideM, +<<<<<<< HEAD key_reorder_ptr + i * kv_num_head * eheadSize * kvSize + kv_j * eheadSize * kvSize + n * eheadSize, +======= + key_reorder_ptr + i * num_head * eheadSize * kvSize + + j * eheadSize * kvSize + n * eheadSize, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) qk_data); } } else { @@ -582,7 +653,11 @@ void cpu_flash_attention( qBlockSize, headSize, static_cast(1), +<<<<<<< HEAD k_data + i * kStrideB + kv_j * kStrideH + +======= + k_data + i * kStrideB + j * kStrideH + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) n * kStrideN, kStrideN, q_data + i * qStrideB + j * qStrideH + @@ -701,8 +776,13 @@ void cpu_flash_attention( n > 0, qk_reduced_data, value_reorder_ptr + +<<<<<<< HEAD i * kv_num_head * kv_padding_size * headSize + kv_j * kv_padding_size * headSize + psize * headSize, +======= + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dst_data); } } else { @@ -713,7 +793,11 @@ void cpu_flash_attention( qBlockSize, kvBlockSize, static_cast(1), +<<<<<<< HEAD v_data + i * vStrideB + kv_j * vStrideH + +======= + v_data + i * vStrideB + j * vStrideH + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) n * vStrideN, vStrideN, conditional_data_ptr(qk_data, qk_reduced_data), @@ -778,15 +862,24 @@ void cpu_flash_attention_backward( // Sizes TORCH_CHECK((query.size(3) == value.size(3)) && (key.size(3) == value.size(3)), "scaled_dot_product_attention_flash_attention_backward: Q/K/V should have the same head size"); +<<<<<<< HEAD // Query (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x KV_seq_len x KV_num_heads x Dim_per_head) // Value (Batch x KV_seq_len x KV_num_heads x Dim_per_head) +======= + // Query (Batch x Q_seq_len x Num_heads x Dim_per_head) + // Key (Batch x KV_seq_len x Num_heads x Dim_per_head) + // Value (Batch x KV_seq_len x Num_heads x Dim_per_head) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t batchSize = query.size(0); int64_t qSize = query.size(1); int64_t kvSize = value.size(1); int64_t num_head = query.size(2); +<<<<<<< HEAD int64_t kv_num_head = key.size(2); int64_t repeat_factor = num_head / kv_num_head; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t headSize = query.size(3); bool has_attn_mask = attn_mask.has_value() && attn_mask.value().numel(); @@ -877,9 +970,15 @@ void cpu_flash_attention_backward( accum_t* buf_data = buf.data_ptr(); scalar_t* buf_reduced_data = is_reduced_type ? 
buf_reduced.data_ptr() : nullptr; +<<<<<<< HEAD at::parallel_for(0, batchSize * kv_num_head, 1, [&](int64_t begin, int64_t end) { int64_t i = 0, kv_j = 0; data_index_init(begin, i, batchSize, kv_j, kv_num_head); +======= + at::parallel_for(0, batchSize * num_head, 1, [&](int64_t begin, int64_t end) { + int64_t i = 0, j = 0; + data_index_init(begin, i, batchSize, j, num_head); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int ompIdx = at::get_thread_num(); accum_t* buf_ptr = buf_data + ompIdx * size_per_thread; accum_t* attn_data = buf_ptr; @@ -891,6 +990,7 @@ void cpu_flash_attention_backward( at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype)); accum_t* dsum_data = dsum.data_ptr(); for ([[maybe_unused]] auto z : c10::irange(begin, end)) { +<<<<<<< HEAD for (int64_t r = 0; r < repeat_factor; r++) { int64_t j = kv_j * repeat_factor + r; // rowsum of grad_out * out @@ -1084,6 +1184,198 @@ void cpu_flash_attention_backward( } // Move to the next query data_index_step(i, batchSize, kv_j, kv_num_head); +======= + // rowsum of grad_out * out + for (int64_t m = 0; m < qSize; m += qSplitSize) { + int64_t qBlockSize = std::min(qSplitSize, qSize - m); + // dsum <- rowsum(grad_out * out) + for (const auto row : c10::irange(qBlockSize)) { + *(dsum_data + row) = vec::map2_reduce_all( + [](Vec x, Vec y) { return x * y; }, + [](Vec x, Vec y) { return x + y; }, + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + (m + row) * grad_oStrideM, + out_data + i * oStrideB + j * oStrideH + (m + row) * oStrideM, + headSize); + } + int64_t num_keys = is_causal ? std::min(m + qBlockSize, kvSize) : kvSize; + for (int64_t n = 0; n < num_keys; n += kvSplitSize) { + int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); + // attn <- scale * q @ k.T + cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + kvBlockSize, + qBlockSize, + headSize, + scaling_factor, + k_data + i * kStrideB + j * kStrideH + + n * kStrideN, + kStrideN, + q_data + i * qStrideB + j * qStrideH + + m * qStrideM, + qStrideM, + static_cast(0), + attn_data, + kvBlockSize); + // attn <- attn + mask + if (has_attn_mask) { + accum_t one = accum_t(1); + for (const auto row : c10::irange(qBlockSize)) { +#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE) + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + (mStrideN == 0 ? 
0 : n), + kvBlockSize, + attn_data + row * kvBlockSize, + one, + mStrideN == 0); +#else + if (mStrideN == 0) { + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM, + kvBlockSize, + attn_data + row * kvBlockSize, + one); + } else { + _scale_attn_mask_fusion_kernel( + attn_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + attn_data + row * kvBlockSize, + one); + } +#endif + } + } + // restore self attention after softmax from logsumexp + // attn <- exp(attn - normalizer) + for (const auto row : c10::irange(qBlockSize)) { + accum_t normalizer = lse_data[i * lStrideB + j * lStrideH + (m + row) * lStrideM]; + vec::map( + [normalizer](Vec x) { return (x - Vec(normalizer)).exp(); }, + attn_data + row * kvBlockSize, + attn_data + row * kvBlockSize, + kvBlockSize); + } + // Apply causal mask, filled unused with 0 + if (is_causal && num_keys - n <= kvSplitSize) { + for (const auto row : c10::irange(qBlockSize)) { + int64_t last_col = m + row - n; + accum_t* row_ptr = attn_data + row * kvBlockSize; + fill_stub(row_ptr + last_col + 1, static_cast(0), kvBlockSize - last_col - 1); + } + } +#ifdef _MSC_VER + if (is_reduced_type) { +#else + if constexpr (is_reduced_type) { +#endif + for (const auto row : c10::irange(qBlockSize)) { + convert( + attn_data + row * kvBlockSize, + attn_reduced_data + row * kvBlockSize, + kvBlockSize); + } + } + // grad_v <- grad_v + attn.T @ grad_out + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + headSize, + kvBlockSize, + qBlockSize, + static_cast(1), + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + + m * grad_oStrideM, + grad_oStrideM, + conditional_data_ptr(attn_data, attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_v_data + i * grad_vStrideB + j * grad_vStrideH + + n * grad_vStrideN, + grad_vStrideN); + // grad_attn <- grad_out @ v.T + cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + kvBlockSize, + qBlockSize, + headSize, + static_cast(1), + v_data + i * vStrideB + j * vStrideH + + n * vStrideN, + vStrideN, + grad_out_data + i * grad_oStrideB + j * grad_oStrideH + + m * grad_oStrideM, + grad_oStrideM, + static_cast(0), + grad_attn_data, + kvBlockSize); + // grad_attn <- attn * (grad_attn - dsum) + for (const auto row : c10::irange(qBlockSize)) { + accum_t d = *(dsum_data + row); + vec::map2( + [d](Vec attn, Vec grad_attn) { return attn * (grad_attn - Vec(d)); }, + grad_attn_data + row * kvBlockSize, + attn_data + row * kvBlockSize, + grad_attn_data + row * kvBlockSize, + kvBlockSize); + } +#ifdef _MSC_VER + if (is_reduced_type) { +#else + if constexpr (is_reduced_type) { +#endif + for (const auto row : c10::irange(qBlockSize)) { + convert( + grad_attn_data + row * kvBlockSize, + grad_attn_reduced_data + row * kvBlockSize, + kvBlockSize); + } + } + // grad_q <- grad_q + scale * grad_attn @ k + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + headSize, + qBlockSize, + kvBlockSize, + scaling_factor, + k_data + i * kStrideB + j * kStrideH + + n * kStrideN, + kStrideN, + conditional_data_ptr(grad_attn_data, grad_attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_q_data + i * grad_qStrideB + j * grad_qStrideH + + m * grad_qStrideM, + grad_qStrideM); + // grad_k <- grad_k + scale * grad_attn.T @ q + cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + headSize, + kvBlockSize, + qBlockSize, + 
scaling_factor, + q_data + i * qStrideB + j * qStrideH + + m * qStrideM, + qStrideM, + conditional_data_ptr(grad_attn_data, grad_attn_reduced_data), + kvBlockSize, + static_cast(1), + grad_k_data + i * grad_kStrideB + j * grad_kStrideH + + n * grad_kStrideN, + grad_kStrideN); + } + } + // Move to the next query + data_index_step(i, batchSize, j, num_head); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }); } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 7587988528ebb..702c9f73db48c 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -441,7 +441,11 @@ struct ComputeLocation // See NOTE [ Grid Sample CPU Kernels ] for details. template +<<<<<<< HEAD inline void +======= +static inline void +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mask_scatter_add(const scalar_t *src, scalar_t* base_addr, const int_same_size_t *offsets, const int_same_size_t *mask, int64_t len) { @@ -1030,7 +1034,11 @@ struct ApplyGridSample +<<<<<<< HEAD inline void grid_sample_2d_grid_slice_iterator( +======= +static inline void grid_sample_2d_grid_slice_iterator( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 261683a187b8a..d48a83b8e3dac 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -259,7 +259,11 @@ void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, +======= +static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +273,11 @@ void histogramdd_kernel_impl(const Tensor& self, const std::optional& we * * Refer to histogramdd_out_cpu_template for more details. 
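The FlashAttentionKernel.cpp hunks above add kv_num_head and repeat_factor = num_head / kv_num_head on the HEAD side, letting several query heads share one key/value head (grouped-query attention); the packing and gemm calls then index K/V with kv_j = j / repeat_factor instead of j. A minimal sketch of that head mapping, with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical sizes: 8 query heads sharing 2 key/value heads.
      const int64_t num_head = 8;
      const int64_t kv_num_head = 2;
      const int64_t repeat_factor = num_head / kv_num_head;  // 4

      for (int64_t j = 0; j < num_head; ++j) {
        const int64_t kv_j = j / repeat_factor;  // KV head serving query head j
        std::printf("query head %lld -> kv head %lld\n",
                    static_cast<long long>(j), static_cast<long long>(kv_j));
      }
      return 0;
    }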
*/ +<<<<<<< HEAD void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, +======= +static void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, @@ -298,7 +306,11 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, std::copy(max_data, max_data + N, rightmost_edges.begin()); } +<<<<<<< HEAD void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +======= +static void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::vector &leftmost_edges, std::vector &rightmost_edges) { AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "histogramdd", [&]() { infer_bin_edges_from_input(input, N, leftmost_edges, rightmost_edges); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 57d3ab89c6174..efe05a5611dcb 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -749,6 +749,7 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // }); if (iter_dtype == kByte) { +<<<<<<< HEAD cpu_hflip_vec(iter); return; } else if (iter_dtype == kChar) { @@ -772,6 +773,23 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { } else if (iter_dtype == kDouble) { cpu_hflip_vec(iter); return; +======= + return cpu_hflip_vec(iter); + } else if (iter_dtype == kChar) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kInt) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kLong) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kShort) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kBool) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kFloat) { + return cpu_hflip_vec(iter); + } else if (iter_dtype == kDouble) { + return cpu_hflip_vec(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below) @@ -786,12 +804,19 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { +<<<<<<< HEAD cpu_hflip_channels_last_vec(iter); return; } // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) cpu_vflip_memcpy(iter); return; +======= + return cpu_hflip_channels_last_vec(iter); + } + // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) + return cpu_vflip_memcpy(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu", diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index aad618a258a37..a0bd72698c138 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -46,7 +46,11 @@ using namespace vec; template typename traits::ArgsTuple dereference_impl(char* 
C10_RESTRICT data[], const int64_t* strides, int64_t i, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::make_tuple( c10::load::type>( data[INDEX] + i * strides[INDEX])...); @@ -65,7 +69,11 @@ dereference_vec_impl(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i, +<<<<<<< HEAD std::index_sequence /*unused*/) { +======= + std::index_sequence) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = typename traits::result_type; using scalar_t = typename Vec::value_type; return std::make_tuple( @@ -89,7 +97,11 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); +<<<<<<< HEAD *out_ptr = std::apply(op, dereference( +======= + *out_ptr = c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[1], &strides[1], i)); @@ -102,7 +114,11 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { +<<<<<<< HEAD std::apply(op, dereference( +======= + c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[0], &strides[0], i)); @@ -162,7 +178,11 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. +<<<<<<< HEAD // 1. Use `std::apply` to make dynamic method invocation +======= +// 1. Use `c10::guts::apply` to make dynamic method invocation +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. 
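The Loops.h hunks above swap c10::guts::apply for std::apply when invoking the user lambda on the tuple of dereferenced operands; since C++17 the standard facility performs exactly this tuple-to-argument expansion. A self-contained sketch of the mechanism, using a plain std::tuple rather than the TensorIterator machinery:

    #include <cstdio>
    #include <tuple>

    int main() {
      auto op = [](float a, float b) { return a + b; };

      // Stand-in for the tuple that dereference(...) builds from the operand pointers.
      std::tuple<float, float> args{1.5f, 2.0f};

      float out = std::apply(op, args);  // expands the tuple into op(1.5f, 2.0f)
      std::printf("%f\n", out);          // 3.5
      return 0;
    }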
@@ -183,7 +203,11 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { +<<<<<<< HEAD auto output = std::apply(op, dereference( +======= + auto output = c10::guts::apply(op, dereference( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +237,13 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); +<<<<<<< HEAD auto out1 = std::apply(vop, std::move(args1)); auto out2 = std::apply(vop, std::move(args2)); +======= + auto out1 = c10::guts::apply(vop, std::move(args1)); + auto out2 = c10::guts::apply(vop, std::move(args2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } @@ -231,7 +260,11 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template inline void unroll_contiguous_scalar_checks( const int64_t* /*strides*/, +<<<<<<< HEAD std::index_sequence<> /*unused*/, +======= + std::index_sequence<>, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cb_t&& cb) { cb(0); } @@ -239,7 +272,11 @@ inline void unroll_contiguous_scalar_checks( template inline void unroll_contiguous_scalar_checks( const int64_t* strides, +<<<<<<< HEAD std::index_sequence /*unused*/, +======= + std::index_sequence, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cb_t&& cb) { if (is_contiguous_scalar(strides)) { cb(INDEX0 + 1); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index a888b8fa801c5..7bf6841d47c9b 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -30,7 +30,11 @@ vec::Vectorized is_nan_vec(vec::Vectorized vec) { return vec.isnan(); } +<<<<<<< HEAD // TODO: use is_integral/is_same to check the scalar_t and simplify the implementation +======= +// TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // currently it does not work template <> vec::Vectorized is_nan_vec(vec::Vectorized vec) { diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index fca7d8bdce5ae..de0f640094a62 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -85,11 +85,19 @@ void cpu_max_unpool( if constexpr (is_3d) { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_depth, +<<<<<<< HEAD "x", output_height, "x", output_width, ")"); } else { TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), " (output volumes are of size ", output_height, "x", output_width, ")"); +======= + "x", output_height, "x", output_width); + } else { + TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), + 
" (output volumes are of size ", output_height, + "x", output_width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 7ea8e87e28b1b..bfb8417759702 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -210,7 +210,11 @@ multinomial_with_replacement_apply( } } +<<<<<<< HEAD void multinomial_with_replacement_kernel_impl( +======= +static void multinomial_with_replacement_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& result, const Tensor& self, const int64_t n_sample, diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index 853fc959f6345..fb0af34aead7b 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -96,7 +96,11 @@ struct ReplicationPad { }; template +<<<<<<< HEAD inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +======= +static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -112,7 +116,11 @@ inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { } template +<<<<<<< HEAD inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +======= +static inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -156,7 +164,11 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? 
p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; +<<<<<<< HEAD // do vectorized copy when output is overlapped with input on W, +======= + // do vectorized copy whe output is overlapped with input on W, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 6fad9270bf193..f70e4e24a95d3 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -9,7 +9,11 @@ namespace at::native { namespace { +<<<<<<< HEAD void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +======= +static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcmul_cpu_out", [&]() { @@ -50,7 +54,11 @@ void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } +<<<<<<< HEAD void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +======= +static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcdiv_cpu_out", [&]() { @@ -90,7 +98,11 @@ void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } +<<<<<<< HEAD void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +======= +static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); if (dtype == kBFloat16) { auto norm_val = norm.to(); @@ -176,7 +188,11 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou } } +<<<<<<< HEAD void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +======= +static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "huber_backward_cpu_out", [&] { auto norm_val = norm.to(); @@ -215,7 +231,11 @@ void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double }); } +<<<<<<< HEAD void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +======= +static void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "mse_backward_cpu_out", [&] { scalar_t scalar_val = value.to(); diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp 
b/aten/src/ATen/native/cpu/PowKernel.cpp index ed23503099ed3..c0839ee3e220b 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -96,6 +96,7 @@ static void pow_tensor_scalar_kernel( dtype == kBFloat16 || isComplexType(dtype)) { // Dispatch to fast specialization for sqrt, rsqrt and reciprocal if (exp_scalar.equal(.5)) { +<<<<<<< HEAD sqrt_kernel(iter); return; } else if (exp_scalar.equal(-0.5)) { @@ -104,6 +105,13 @@ static void pow_tensor_scalar_kernel( } else if (exp_scalar.equal(-1.0)) { reciprocal_kernel(iter); return; +======= + return sqrt_kernel(iter); + } else if (exp_scalar.equal(-0.5)) { + return rsqrt_kernel(iter); + } else if (exp_scalar.equal(-1.0)) { + return reciprocal_kernel(iter); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } @@ -120,7 +128,11 @@ static void pow_tensor_scalar_kernel( } else if (dtype == ScalarType::Half) { [&]() { using scalar_t = +<<<<<<< HEAD c10::impl::ScalarTypeToCPPTypeT; +======= + decltype(c10::impl::ScalarTypeToCPPType::t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto exp = exp_scalar.to(); using Vec = Vectorized; cpu_kernel_vec(iter, diff --git a/aten/src/ATen/native/cpu/README.md b/aten/src/ATen/native/cpu/README.md index 6a7ed0d12b0eb..838a3d0282ecf 100644 --- a/aten/src/ATen/native/cpu/README.md +++ b/aten/src/ATen/native/cpu/README.md @@ -74,7 +74,11 @@ it to sum up the entire array into a single value. `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which compiler flags it is currently compiled. This allows the programmer to write +<<<<<<< HEAD generic code, which will be compiled under multiplied compilation settings. +======= +generic code, which will be compiled under multipled compilation settings. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains a generic definition of `sumImplAll`. 
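The README hunk above describes writing one generic kernel body that gets compiled several times, once per CPU_CAPABILITY_* setting. A rough, simplified illustration of that idea (the macro name, flags, and namespace here are stand-ins, not the actual ATen build wiring): compile the same translation unit more than once with different definitions, so each copy lands in its own namespace and the optimizer vectorizes it according to the flags used for that copy.

    // Hypothetical build lines for the sketch below:
    //   c++ -O2        -DMY_CAPABILITY=scalar -c sum.cpp -o sum_scalar.o
    //   c++ -O2 -mavx2 -DMY_CAPABILITY=avx2   -c sum.cpp -o sum_avx2.o
    #include <cstddef>

    #ifndef MY_CAPABILITY
    #define MY_CAPABILITY scalar
    #endif

    namespace MY_CAPABILITY {

    // Written once; how well it vectorizes depends on the flags for this copy.
    float sum_all(const float* data, std::size_t n) {
      float acc = 0.f;
      for (std::size_t i = 0; i < n; ++i) {
        acc += data[i];
      }
      return acc;
    }

    }  // namespace MY_CAPABILITY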
This function allows the user to reduce diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index b469aa5c2eee6..a38342191b8ed 100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -18,7 +18,11 @@ namespace { using namespace vec; +<<<<<<< HEAD void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +======= +static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "arange_cpu", [&]() { using accscalar_t = at::acc_type; auto start = scalar_start.to(); @@ -42,7 +46,11 @@ void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scala }); } +<<<<<<< HEAD void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +======= +static void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "linspace_cpu", [&]() { // step should be of double type for all integral types using step_t = std::conditional_t, double, scalar_t>; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index c7eaa802af125..252298596ad96 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -62,7 +62,11 @@ inline void reduce_all_impl( output.fill_(result); } +<<<<<<< HEAD void min_all_kernel_impl(Tensor& result, const Tensor& input) { +======= +static void min_all_kernel_impl(Tensor& result, const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -87,7 +91,11 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) { } } +<<<<<<< HEAD void max_all_kernel_impl(Tensor& result, const Tensor& input) { +======= +static void max_all_kernel_impl(Tensor& result, const Tensor& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -167,7 +175,11 @@ inline void reduce_all_impl_vec_two_outputs( output2.fill_(result.second); } +<<<<<<< HEAD void aminmax_allreduce_kernel( +======= +static void aminmax_allreduce_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, Tensor& min_result, Tensor& max_result) { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 2e62936501948..e1d0689713420 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -28,7 +28,11 @@ namespace at::native { namespace { using namespace vec; template +<<<<<<< HEAD 
inline void cpu_cum_base_kernel(const Tensor& result, +======= +static inline void cpu_cum_base_kernel(const Tensor& result, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const func_t& f, @@ -76,7 +80,11 @@ inline void cpu_cum_base_kernel(const Tensor& result, iter.for_each(loop, grain_size); } +<<<<<<< HEAD void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -95,7 +103,11 @@ void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -114,7 +126,11 @@ void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +======= +static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -135,7 +151,11 @@ void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { }); } +<<<<<<< HEAD void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +======= +static void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, @@ -148,7 +168,11 @@ void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt }); } +<<<<<<< HEAD void prod_kernel_impl(TensorIterator& iter) { +======= +static void prod_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Workaround for the error: '*' in boolean context, suggest '&&' instead if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; @@ -203,7 +227,11 @@ void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { } } +<<<<<<< HEAD void norm_kernel_tensor_iterator_impl( +======= +static void norm_kernel_tensor_iterator_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorIterator& iter, const Scalar& p) { double val = 0; @@ -256,10 +284,17 @@ void norm_kernel_tensor_iterator_impl( } else { if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel +<<<<<<< HEAD 
norm_kernel_cpu_impl(iter, val); return; } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel norm_kernel_cpu_impl(iter, val); return; +======= + return norm_kernel_cpu_impl(iter, val); + } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { + // type promotion that does cast and reduction in a single kernel + return norm_kernel_cpu_impl(iter, val); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] { @@ -274,7 +309,11 @@ void norm_kernel_tensor_iterator_impl( } } +<<<<<<< HEAD void and_kernel_impl(TensorIterator& iter) { +======= +static void and_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -312,7 +351,11 @@ void and_kernel_impl(TensorIterator& iter) { } } +<<<<<<< HEAD void or_kernel_impl(TensorIterator& iter) { +======= +static void or_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -346,7 +389,11 @@ struct MinValuesOps: public at::native::MinOps { } }; +<<<<<<< HEAD void min_values_kernel_impl(TensorIterator& iter) { +======= +static void min_values_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (iter.dtype() == kLong) { // This case is special because of Vectorized does not // handle upper_bound(). 
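The norm fast path above is annotated as a "type promotion that does cast and reduction in a single kernel": half or bfloat16 inputs with a float output are converted element-by-element while reducing, instead of materializing a casted copy first. Sketched here on plain scalars, with float standing in for the low-precision input dtype and double for the accumulation dtype (not the ATen dispatch):

    #include <cstdio>

    // Illustrative only: cast each element to the wider type as it is consumed,
    // accumulating in that wider type within the same pass.
    double sum_of_squares(const float* x, int n) {
      double acc = 0.0;                              // wider accumulator
      for (int i = 0; i < n; ++i) {
        const double v = static_cast<double>(x[i]);  // cast on the fly
        acc += v * v;                                // reduce in the same pass
      }
      return acc;
    }

    int main() {
      const float data[4] = {1.f, 2.f, 3.f, 4.f};
      std::printf("%f\n", sum_of_squares(data, 4));  // 30.0
      return 0;
    }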
@@ -367,7 +414,11 @@ void min_values_kernel_impl(TensorIterator& iter) { }); } +<<<<<<< HEAD void max_values_kernel_impl(TensorIterator& iter) { +======= +static void max_values_kernel_impl(TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] { binary_kernel_reduce_vec( iter, @@ -377,7 +428,11 @@ void max_values_kernel_impl(TensorIterator& iter) { }); } +<<<<<<< HEAD void argmax_kernel_impl(TensorIterator &iter) { +======= +static void argmax_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmax_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -401,7 +456,11 @@ void argmax_kernel_impl(TensorIterator &iter) { }); } +<<<<<<< HEAD void argmin_kernel_impl(TensorIterator &iter) { +======= +static void argmin_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmin_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -425,6 +484,7 @@ void argmin_kernel_impl(TensorIterator &iter) { }); } +<<<<<<< HEAD template struct XorSumOps { inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { @@ -468,6 +528,8 @@ void xor_sum_kernel_impl(TensorIterator& iter) { }); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // anonymous namespace REGISTER_DISPATCH(std_var_stub, &std_var_kernel_impl) @@ -482,7 +544,10 @@ REGISTER_DISPATCH(min_values_stub, &min_values_kernel_impl) REGISTER_DISPATCH(max_values_stub, &max_values_kernel_impl) REGISTER_DISPATCH(argmax_stub, &argmax_kernel_impl) REGISTER_DISPATCH(argmin_stub, &argmin_kernel_impl) +<<<<<<< HEAD REGISTER_DISPATCH(xor_sum_stub, &xor_sum_kernel_impl) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) REGISTER_DISPATCH(cumprod_stub, &cumprod_cpu_kernel) REGISTER_DISPATCH(cumsum_stub, &cumsum_cpu_kernel) diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index 1b0be8d18db7d..199b89337bd62 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -8,6 +8,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { inline namespace CPU_CAPABILITY { diff --git a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp index 8d22201ed63c4..63419dcb6861f 100644 --- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp +++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp @@ -428,11 +428,18 @@ void fp16_gemv_trans( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0); #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) if (at::globalContext().allowFP16ReductionCPU()) { +<<<<<<< HEAD 
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); return; } #endif fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); +======= + return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + } +#endif + return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) { @@ -466,7 +473,11 @@ void bf16_gemv_trans( at::BFloat16* y, const int incy) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0); +<<<<<<< HEAD bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); +======= + return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float fp16_dot( diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index 895263bc44664..78663cecf2163 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -41,7 +41,11 @@ class ReduceMultiply { *self_data = c10::load(self_data) && c10::load(src_data); } }; +<<<<<<< HEAD ReduceMultiply reduce_multiply; +======= +static ReduceMultiply reduce_multiply; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceAdd { public: @@ -51,7 +55,11 @@ class ReduceAdd { *self_data += opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD ReduceAdd reduce_add; +======= +static ReduceAdd reduce_add; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMean { public: @@ -61,7 +69,11 @@ class ReduceMean { *self_data += opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD ReduceMean reduce_mean; +======= +static ReduceMean reduce_mean; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMaximum { public: @@ -73,7 +85,11 @@ class ReduceMaximum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::max(self_value, opmath_t(src_value)); } }; +<<<<<<< HEAD ReduceMaximum reduce_maximum; +======= +static ReduceMaximum reduce_maximum; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class ReduceMinimum { public: @@ -85,7 +101,11 @@ class ReduceMinimum { *self_data = at::_isnan(src_value) ? 
opmath_t(src_value) : std::min(self_value, opmath_t(src_value)); } }; +<<<<<<< HEAD ReduceMinimum reduce_minimum; +======= +static ReduceMinimum reduce_minimum; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class TensorAssign { public: @@ -95,7 +115,11 @@ class TensorAssign { *self_data = opmath_t(c10::load(src_data)); } }; +<<<<<<< HEAD TensorAssign tensor_assign; +======= +static TensorAssign tensor_assign; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct _cpu_scatter_gather_dim_loop { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 9ecfe55cedc4a..abcfab16806f1 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -7,7 +7,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -17,6 +20,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in @@ -647,10 +654,17 @@ _vec_softmax( parallel_for( 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; +<<<<<<< HEAD std::vector temp_vec_input(dim_size * vectorized_step); std::vector temp_vec_output(dim_size * vectorized_step); float* temp_vec_input_data = temp_vec_input.data(); float* temp_vec_output_data = temp_vec_output.data(); +======= + std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); + std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); + float* temp_vec_input_data = temp_vec_input.get(); + float* temp_vec_output_data = temp_vec_output.get(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; @@ -968,7 +982,11 @@ struct vec_host_softmax_backward { } }; +<<<<<<< HEAD void softmax_lastdim_kernel_impl( +======= +static void softmax_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -977,13 +995,21 @@ void softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } +<<<<<<< HEAD void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } +<<<<<<< HEAD void log_softmax_lastdim_kernel_impl( +======= +static void log_softmax_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -992,13 +1018,21 @@ void log_softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } +<<<<<<< HEAD void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +======= +static void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } +<<<<<<< HEAD void softmax_backward_lastdim_kernel_impl( +======= +static void softmax_backward_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1010,7 +1044,11 @@ void softmax_backward_lastdim_kernel_impl( }); } +<<<<<<< HEAD void log_softmax_backward_lastdim_kernel_impl( +======= +static void log_softmax_backward_lastdim_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1022,7 +1060,11 @@ void log_softmax_backward_lastdim_kernel_impl( }); } +<<<<<<< HEAD void softmax_backward_kernel_impl( +======= +static void softmax_backward_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output, @@ -1038,7 +1080,11 @@ void softmax_backward_kernel_impl( }); } +<<<<<<< HEAD void log_softmax_backward_kernel_impl( +======= +static void log_softmax_backward_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_input, const Tensor& grad, const Tensor& output, diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 7d337c119c983..22ff180e1333a 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -90,7 +90,11 @@ struct KeyValueCompDesc { }; #ifdef USE_FBGEMM +<<<<<<< HEAD bool can_use_radix_sort(const TensorBase& values, const bool descending) { +======= +static bool can_use_radix_sort(const TensorBase& values, const bool descending) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // radix_sort can be used only for 1D data if (values.dim() != 1) return false; // radix_sort sorts in ascending order @@ -106,7 +110,11 @@ bool can_use_radix_sort(const TensorBase& values, const bool descending) { return true; } +<<<<<<< HEAD void parallel_sort1d_kernel( +======= +static void parallel_sort1d_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& values, const TensorBase& indices) { AT_DISPATCH_INTEGRAL_TYPES(values.scalar_type(), "parallel_sort1d_kernel", [&] { @@ -140,7 +148,11 @@ void parallel_sort1d_kernel( #endif template +<<<<<<< HEAD inline void sort_kernel_impl(const value_accessor_t& value_accessor, +======= +static inline void sort_kernel_impl(const 
value_accessor_t& value_accessor, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const indices_accessor_t& indices_accessor, int64_t dim_size, bool descending, bool stable) { auto composite_accessor = CompositeRandomAccessorCPU< @@ -165,7 +177,11 @@ inline void sort_kernel_impl(const value_accessor_t& value_accessor, } } +<<<<<<< HEAD void sort_kernel( +======= +static void sort_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase& self, const TensorBase& values, const TensorBase& indices, @@ -222,7 +238,11 @@ void sort_kernel( ); } +<<<<<<< HEAD void topk_kernel( +======= +static void topk_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const TensorBase &values, const TensorBase &indices, const TensorBase &self, diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 0fda4ae05f3e0..f6a86ac9484d1 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -286,12 +286,20 @@ struct CastStoreAccumulate { }; template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) StorePolicy::store(data, stride, index, value); } template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const std::array &values) { auto *base_ptr = data + stride * index; for (const auto k : c10::irange(numel)) { @@ -301,7 +309,11 @@ void store(char * C10_RESTRICT data, int64_t stride, int64_t index, } template +<<<<<<< HEAD void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +======= +static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Vectorized &values) { using vec_t = Vectorized; alignas(64) std::array array_values{}; diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index c479e1610cbeb..ad4193e69926e 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -29,7 +29,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD inline void compare_base_kernel_core( +======= +static inline void compare_base_kernel_core( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result1, const Tensor& result2, const Tensor& self, @@ -71,7 +75,11 @@ inline void compare_base_kernel_core( } template +<<<<<<< HEAD inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +======= +static inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, @@ -98,7 +106,11 @@ inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, result1, result2, self, dim, keepdim, loop); } +<<<<<<< HEAD void min_kernel_impl( +======= +static void min_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& indice, const Tensor& self, @@ -131,7 +143,11 @@ void min_kernel_impl( }); } +<<<<<<< HEAD void max_kernel_impl( +======= +static void max_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& result, const Tensor& indice, const Tensor& self, @@ -164,7 +180,11 @@ void max_kernel_impl( }); } +<<<<<<< HEAD void aminmax_kernel( +======= +static void aminmax_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, @@ -212,7 +232,11 @@ void aminmax_kernel( }); } +<<<<<<< HEAD void where_kernel_impl(TensorIterator &iter) { +======= +static void where_kernel_impl(TensorIterator &iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_V2( iter.dtype(), "where_cpu", [&] { cpu_kernel( @@ -224,19 +248,31 @@ void where_kernel_impl(TensorIterator &iter) { kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_FLOAT8_TYPES)); } +<<<<<<< HEAD void isposinf_kernel_impl(TensorIteratorBase& iter) { +======= +static void isposinf_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == std::numeric_limits::infinity(); }); }); } +<<<<<<< HEAD void isneginf_kernel_impl(TensorIteratorBase& iter) { +======= +static void isneginf_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); }); }); } +<<<<<<< HEAD void mode_kernel_impl( +======= +static void mode_kernel_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& values, Tensor& indices, const Tensor& self, @@ -308,7 +344,11 @@ void mode_kernel_impl( // Default brute force implementation of isin(). Used when the number of test elements is small. // Iterates through each element and checks it against each test element. 
+<<<<<<< HEAD void isin_default_kernel_cpu( +======= +static void isin_default_kernel_cpu( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& elements, const Tensor& test_elements, bool invert, @@ -339,7 +379,11 @@ void isin_default_kernel_cpu( }); } +<<<<<<< HEAD void clamp_kernel_impl(TensorIteratorBase& iter) { +======= +static void clamp_kernel_impl(TensorIteratorBase& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { @@ -355,7 +399,11 @@ void clamp_kernel_impl(TensorIteratorBase& iter) { }); } +<<<<<<< HEAD void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +======= +static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_scalar_cpu", [&]() { const auto min = min_.to(); const auto max = max_.to(); @@ -371,7 +419,11 @@ void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, cons }); } +<<<<<<< HEAD void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +======= +static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); const Vectorized max_vec(max); @@ -385,7 +437,11 @@ void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { }); } +<<<<<<< HEAD void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +======= +static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_min_scalar_cpu", [&]() { const auto min = min_.to(); const Vectorized min_vec(min); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 444ec10861da8..ff397a8e183e9 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -13,7 +13,11 @@ namespace at::native { namespace { template +<<<<<<< HEAD inline void cadd( +======= +static inline void cadd( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* z, const scalar_t* x, const scalar_t* y, @@ -34,7 +38,11 @@ inline void cadd( } template +<<<<<<< HEAD void unfolded2d_acc( +======= +static void unfolded2d_acc( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -113,7 +121,11 @@ void unfolded2d_acc( } template +<<<<<<< HEAD void unfolded2d_acc_channels_last( +======= +static void unfolded2d_acc_channels_last( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -169,10 +181,13 @@ void unfolded2d_acc_channels_last( /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16) // Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16 __attribute__((optimize("no-tree-vectorize"))) #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void unfolded2d_acc_kernel( ScalarType dtype, void *finput_data, @@ -225,7 +240,11 @@ void unfolded2d_acc_kernel( } template +<<<<<<< HEAD void unfolded2d_copy( +======= +static void unfolded2d_copy( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_data, scalar_t* finput_data, int64_t kH, @@ -240,7 +259,11 @@ void unfolded2d_copy( int64_t output_height, int64_t output_width) { at::parallel_for( +<<<<<<< HEAD 0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { +======= + 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto k : c10::irange(start, end)) { int64_t nip = k / (kH * kW); int64_t rest = k % (kH * kW); @@ -316,7 +339,11 @@ void unfolded2d_copy( for (int64_t x = 0; x < output_width; x++) memcpy( dst + (size_t)y * output_width + x, +<<<<<<< HEAD src + (size_t)iy * input_width + ix + x * dW, +======= + src + (size_t)iy * input_width + ix + (int64_t)x * dW, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sizeof(scalar_t) * (1)); } } @@ -326,7 +353,11 @@ void unfolded2d_copy( } template +<<<<<<< HEAD void unfolded2d_copy_channels_last( +======= +static void unfolded2d_copy_channels_last( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const scalar_t* input_data, scalar_t* finput_data, int64_t kH, diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index e59e5985bf7f3..56859dca14ed8 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -157,13 +157,21 @@ struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { }; template +<<<<<<< HEAD inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +======= +static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using opmath_t = at::opmath_type; return Interpolate::eval(src, data, strides, i); } template +<<<<<<< HEAD inline scalar_t interpolate_aa_single_dim_zero_strides( +======= +static inline scalar_t interpolate_aa_single_dim_zero_strides( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* src, char** data, const index_t ids_stride) { @@ -187,7 +195,11 @@ inline scalar_t interpolate_aa_single_dim_zero_strides( } template +<<<<<<< HEAD inline scalar_t interpolate_aa_single_dim( +======= 
+static inline scalar_t interpolate_aa_single_dim( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* src, char** data, const int64_t* strides, @@ -213,7 +225,11 @@ inline scalar_t interpolate_aa_single_dim( } template +<<<<<<< HEAD inline bool is_zero_stride(const int64_t* strides) { +======= +static inline bool is_zero_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool output = strides[0] == 0; for (const auto i : c10::irange(1, m)) { output &= (strides[i] == 0); @@ -222,7 +238,11 @@ inline bool is_zero_stride(const int64_t* strides) { } template +<<<<<<< HEAD inline bool is_contiguous_stride(const int64_t* strides) { +======= +static inline bool is_contiguous_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t)); for (int i=2; i<2 * interp_size; i+=2) { output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t)); @@ -282,13 +302,21 @@ struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, inte }; template +<<<<<<< HEAD inline bool check_almost_all_zero_stride(const int64_t* strides) { +======= +static inline bool check_almost_all_zero_stride(const int64_t* strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CheckAlmostAllZeroStrides::eval(strides); } // Helper method to compute interpolation for nearest, linear, cubic modes template +<<<<<<< HEAD inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +======= +static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char* dst = data[0]; char* src = data[1]; for (const auto i : c10::irange(n)) { @@ -298,7 +326,11 @@ inline void basic_loop(char** data, const int64_t* strides, int64_t n) { } template +<<<<<<< HEAD inline void basic_loop_aa_vertical( +======= +static inline void basic_loop_aa_vertical( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char** data, const int64_t* strides, int64_t n, @@ -354,7 +386,11 @@ inline void basic_loop_aa_vertical( } template +<<<<<<< HEAD inline void basic_loop_aa_horizontal( +======= +static inline void basic_loop_aa_horizontal( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) char** data, const int64_t* strides, int64_t n, @@ -1038,7 +1074,11 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement +<<<<<<< HEAD static constexpr int interp_size = 1; +======= + static const int interp_size = 1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1195,11 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { +<<<<<<< HEAD static constexpr int interp_size = 2; +======= + static const int interp_size = 2; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1319,11 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { +<<<<<<< HEAD static constexpr int interp_size = 4; +======= + static const int interp_size = 4; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 073cc4fd7e8bb..debaceef74b5f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -35,7 +35,11 @@ Like PIL, Pillow is licensed under the open source HPND License namespace { +<<<<<<< HEAD inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +======= +static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int32_t v; if (i32_aligned) { v = *(const int32_t*)ptr; @@ -45,11 +49,19 @@ inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligne return _mm_cvtsi32_si128(v); } +<<<<<<< HEAD inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); } inline void _write_endline_rgb_as_uint32( +======= +static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); +} + +static inline void _write_endline_rgb_as_uint32( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uint8_t* C10_RESTRICT output, uint32_t data ) { @@ -889,7 +901,11 @@ void ImagingResampleHorizontalConvolution8u( _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); +<<<<<<< HEAD // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA +======= + // Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // RGBA: pix1 = [ // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0 diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h index d66bb0a0ec068..a5fa02558cd01 100644 --- a/aten/src/ATen/native/cpu/avx_mathfun.h +++ 
b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -240,7 +240,11 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002); _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI +<<<<<<< HEAD /* evaluation of 8 sines at once using AVX intrinsics +======= +/* evaluation of 8 sines at onces using AVX intrinsics +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index d013dfa0485e0..a471ff71cce07 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,11 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. +<<<<<<< HEAD // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. +======= + // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. // diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index adac022bc8a5d..f157a6a314105 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -311,7 +311,11 @@ void GroupNormKernelImplChannelsLastInternal( const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; +<<<<<<< HEAD // NB: About algorithm chosen: +======= + // NB: About algorithm choosen: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // On channels last, GroupNorm has a input shape of {N, H, W, GD}, // Mean and rstd are collected per each n and g, which involves reduction diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index a9683ba4bef3f..f45c0e450ee9d 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -838,7 +838,11 @@ void dyn_quant_pack_4bit_weight_kernel( } } +<<<<<<< HEAD void ref_dyn_quant_matmul_4bit_channelwise_kernel( +======= +static void ref_dyn_quant_matmul_4bit_channelwise_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t m, size_t n, size_t k, @@ -906,7 +910,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( // Round to nearest integer const int32_t nudged_zero_point0 = lrintf(zero_point0); +<<<<<<< HEAD int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; +======= + int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // LHS offset at the beginning of the row *((float*)(dst_ptr)) = 
recip_scale0; @@ -930,7 +938,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } }; +<<<<<<< HEAD // Dynamically Quantize the float32 input to 8 bit asymmetric +======= + // Dynamically Quantize the float32 input to 8 bit assymetric +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx); const size_t lhs_stride = @@ -997,7 +1009,11 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } } +<<<<<<< HEAD void ref_dyn_quant_matmul_4bit_groupwise_kernel( +======= +static void ref_dyn_quant_matmul_4bit_groupwise_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size_t m, size_t n, size_t k, @@ -1048,7 +1064,11 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = (std::min)(zero_point0, qmax); const int32_t nudged_zero_point0 = lrintf(zero_point0); +<<<<<<< HEAD int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; +======= + int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); @@ -1163,7 +1183,11 @@ void dyn_quant_matmul_4bit_kernel( const int64_t weight_packed_size = kleidiai::kai_pack_rhs_int4_size(N, K, block_size); if (weight_packed_size == packed_weights.numel()) { +<<<<<<< HEAD // KleidiAI interface internally handles the Channelwise and groupwise +======= + // KleidiAI interface intenally handles the Channelwise and groupwise +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // distinction kleidiai::kai_quant_pack_lhs_int4_mm( output, inp, packed_weights, M, N, K, block_size); diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp index 496b982619649..4d53e0c609b6e 100644 --- a/aten/src/ATen/native/cpu/int8mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -100,7 +100,11 @@ inline void tinygemm_kernel( #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +<<<<<<< HEAD inline float _mm256_reduce_add_ps(__m256& v) { +======= +static inline float _mm256_reduce_add_ps(__m256& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); v = _mm256_add_ps(v, v1); v1 = _mm256_shuffle_ps(v, v, 0x4E); @@ -367,6 +371,7 @@ void int8pack_mm_kernel_( auto* C_data = C.data_ptr(); const auto* S_data = scales.const_data_ptr(); +<<<<<<< HEAD int64_t M = A.size(0); int64_t N = B.size(0); int64_t K = A.size(1); @@ -379,15 +384,36 @@ void int8pack_mm_kernel_( at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) { int64_t mb{0}, nb{0}; +======= + int M = A.size(0); + int N = B.size(0); + int K = A.size(1); + int lda = A.stride(0); + constexpr int BLOCK_M = 4; + constexpr int BLOCK_N = 4; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) data_index_init(begin, mb, MB, nb, NB); for (const auto i : c10::irange(begin, end)) { (void)i; +<<<<<<< HEAD int64_t mb_start 
= mb * BLOCK_M; int64_t mb_size = std::min(BLOCK_M, M - mb_start); int64_t nb_start = nb * BLOCK_N; int64_t nb_size = std::min(BLOCK_N, N - nb_start); +======= + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto* A_ptr = A_data + mb_start * lda; const auto* B_ptr = B_data + nb_start * K; diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h index 8aba425e89637..d52bc276fbbff 100644 --- a/aten/src/ATen/native/cpu/moments_utils.h +++ b/aten/src/ATen/native/cpu/moments_utils.h @@ -8,6 +8,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::native { @@ -117,11 +121,17 @@ std::pair, opmath_t> RowwiseMomentsImpl(const T* X, int64_t N, in using Vec = vec::Vectorized; const Vec kZeroVec(math_t(0)); +<<<<<<< HEAD std::array m0_stk = {{0}}; std::array m1_stk; m1_stk.fill(kZeroVec); std::array m2_stk; m2_stk.fill(kZeroVec); +======= + c10::SmallVector m0_stk(depth, 0); + c10::SmallVector m1_stk(depth, kZeroVec); + c10::SmallVector m2_stk(depth, kZeroVec); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(m)) { const T* X_ptr = X + i * kChunkSize * kVecSize; diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 827c69629eb37..968835d1874d2 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -6,9 +6,13 @@ #include #ifdef USE_FBGEMM +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") #include C10_DIAGNOSTIC_POP() +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace at::native { @@ -167,12 +171,15 @@ inline void transpose(int64_t M, int64_t N, const uint16_t* src, int64 TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); } +<<<<<<< HEAD template <> inline void transpose(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) { TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif template diff --git a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu index fcacef37ceaf0..8ca84aab0d54f 100644 --- a/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu +++ b/aten/src/ATen/native/cuda/ActivationHardsigmoidKernel.cu @@ -36,7 +36,11 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { [zero, one_sixth, three, six] GPU_LAMBDA( scalar_t self_val) -> scalar_t { opmath_t x = static_cast(self_val); +<<<<<<< HEAD return std::min(std::max(x + three, zero), six) * one_sixth; +======= + return std::min(std::max(x + three, zero), six) * one_sixth; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) }); }); } diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 47c705a667b52..2ac0cbed4d2d6 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -526,7 +526,11 @@ namespace { // we are dealing with packed tensor here. max index is the same as numel. +<<<<<<< HEAD // TODO: to really support input tensor large enough to go beyond int32, +======= + // TODO: to really support input tensor large enought to go beyond int32, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we will need to restrict out shared memory usage and adjust the launch // config; AT_ASSERT(input_.numel() < std::numeric_limits::max()); @@ -681,7 +685,11 @@ namespace { const dim3 grid(grid_x, grid_y, grid_z); // we are dealing with packed tensor here. max index is the same as numel. +<<<<<<< HEAD // TODO: to really support input tensor large enough to go beyond int32, +======= + // TODO: to really support input tensor large enought to go beyond int32, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we will need to restrict out shared memory usage and adjust the launch // config; AT_ASSERT(input.numel() < std::numeric_limits::max()); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index d9a0b0059917f..265b74036e321 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -53,7 +53,11 @@ __global__ void adaptiveaveragepool( const scalar_t *input, scalar_t *output, int isizeT, int isizeH, int isizeW, int osizeT, int osizeH, int osizeW, +<<<<<<< HEAD int64_t sizeD, int64_t istrideB, int64_t istrideD, +======= + int64_t istrideD, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t istrideT, int64_t istrideH, int64_t istrideW, int64_t offsetZ) { // iterates on output pixels @@ -70,17 +74,26 @@ __global__ void adaptiveaveragepool( // select output plane int64_t o_plane = blockIdx.x + offsetZ; ot = o_plane % osizeT; // output frame/time +<<<<<<< HEAD int d = o_plane / osizeT; // flattened (batch, channel) index // Decompose d into batch and channel indices int batch_idx = d / sizeD; int channel_idx = d % sizeD; +======= + int d = o_plane / osizeT; // slice/feature +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // input frame/time range is fixed. 
int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; +<<<<<<< HEAD +======= + // input offset by slice/feature and earliest relevant frame/time + const scalar_t *input_dt = input + d*istrideD + istartT*istrideT; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // output offset by slice/feature and frame/time scalar_t *output_dt = output + o_plane*osizeH*osizeW; @@ -95,6 +108,11 @@ __global__ void adaptiveaveragepool( int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; +<<<<<<< HEAD +======= + // Compute the average pooling from corresponding input pixels + const scalar_t *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t *ptr_output = output_dt + oh*osizeW + ow; accscalar_t sum = static_cast(0); @@ -102,6 +120,7 @@ __global__ void adaptiveaveragepool( for (it = 0; it < kT; ++it) { for (ih = 0; ih < kH; ++ih) { for (iw = 0; iw < kW; ++iw) { +<<<<<<< HEAD int64_t input_offset = batch_idx * istrideB + channel_idx * istrideD + (istartT + it) * istrideT + (istartH + ih) * istrideH + (istartW + iw) * istrideW; @@ -109,6 +128,13 @@ __global__ void adaptiveaveragepool( sum += static_cast(val); } } +======= + scalar_t val = ptr_input[ih*istrideH + iw*istrideW]; + sum += static_cast(val); + } + } + ptr_input += istrideT; // next input frame +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Update output const accscalar_t divide_factor = static_cast(kT * kH * kW); @@ -123,7 +149,11 @@ void adaptiveaveragepool_loop( int64_t totalZ, int isizeT, int isizeH, int isizeW, int osizeT, int osizeH, int osizeW, +<<<<<<< HEAD int64_t sizeD, int64_t istrideB, int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { +======= + int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t offsetZ = 0; dim3 threads(32, 8); // each H*W plane is processed by blocksH thread blocks @@ -135,7 +165,11 @@ void adaptiveaveragepool_loop( input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, +<<<<<<< HEAD sizeD, istrideB, istrideD, +======= + istrideD, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideT, istrideH, istrideW, offsetZ); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -366,7 +400,11 @@ void adaptive_avg_pool3d_out_cuda_template( int64_t osizeW = output_size[2]; int64_t sizeD, isizeT, isizeH, isizeW; +<<<<<<< HEAD int64_t istrideB, istrideD, istrideT, istrideH, istrideW; +======= + int64_t istrideD, istrideT, istrideH, istrideW; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t totalZ; const Tensor& input = input_.ndimension() == 4 ? 
input_ : input_.contiguous(); @@ -377,7 +415,10 @@ void adaptive_avg_pool3d_out_cuda_template( isizeH = input.size(2); isizeW = input.size(3); +<<<<<<< HEAD istrideB = 0; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideD = input.stride(0); istrideT = input.stride(1); istrideH = input.stride(2); @@ -393,7 +434,10 @@ void adaptive_avg_pool3d_out_cuda_template( isizeH = input.size(3); isizeW = input.size(4); +<<<<<<< HEAD istrideB = input.stride(0); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) istrideD = input.stride(1); istrideT = input.stride(2); istrideH = input.stride(3); @@ -419,7 +463,11 @@ void adaptive_avg_pool3d_out_cuda_template( totalZ, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, +<<<<<<< HEAD sizeD, istrideB, istrideD, istrideT, istrideH, istrideW); +======= + istrideD, istrideT, istrideH, istrideW); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index f29be23acd559..46a3684a21974 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -4,7 +4,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -13,11 +16,15 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include #include +<<<<<<< HEAD #include #include #include @@ -28,6 +35,11 @@ #ifdef USE_FBGEMM_GENAI #include #endif +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -58,8 +70,165 @@ namespace at::native { +<<<<<<< HEAD using at::blas::ScalingType; using at::blas::SwizzleType; +======= +namespace { + +// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 +c10::MaybeOwned inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { + if (resolve_conj && tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, transpose_result ? 
transpose_tensor : !transpose_tensor); + } + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, !transpose_result); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, transpose_result); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, true); + } + + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, true); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, true); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + + +/** + * @brief Prepares matrices for CUBLAS operation + * + * This constructor prepares tensors for CUBLAS + * The main difference is that PyTorch uses row-major as the default and + * CUBLAS expects column-major. + * + * @details + * To enable row-major output while using CUBLAS, + * we use the mathematical identity that (A × B)^T = B^T × A^T. + * + * Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major) + * T = row-major, N = col-major + * + * Example: + * For matrices A (M×K)(row-major) and B (K×N)(row-major): + * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) + * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) + * - However, since the output form cublas is column-major this is + * - equivalent to an output of size MxN row-major as expected + * + * The transpose flags are derived from the layouts of the passed in tensors + * + * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted + * to their unpacked values to match what cuBLAS expects. + * + * @param mat1 First input matrix + * @param mat2 Second input matrix + * @param c Output matrix (result) + * @param scale_a Optional scaling factor for first matrix + * @param scale_b Optional scaling factor for second matrix + * @param scale_result Optional scaling factor for result + */ +struct cublasCommonArgs { + cublasCommonArgs( + const Tensor& mat1, + const Tensor& mat2, + Tensor& c, + const std::optional& scale_a = std::nullopt, + const std::optional& scale_b = std::nullopt, + const std::optional& scale_result = std::nullopt) { + bool transpose_result = false, transpose_a = false, transpose_b = false; + result = prepare_matrix_for_cublas(c, transpose_result); + mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); + matb = prepare_matrix_for_cublas(transpose_result ? 
mat1 : mat2, transpose_b, transpose_result); + + // Handle scale tensors if provided + if (scale_a && scale_b) { + // By default since we return in row-major we run the gemm + // as B.T @ A.T, check transpose_result to determine if we flip the scales + scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); + scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); + scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); + scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); + } + + if (scale_result) { + scale_result_ptr = scale_result->data_ptr(); + scale_result_dtype = scale_result->scalar_type(); + } + + // Update transpose flags + if (transpose_result) { + transpose_a = !transpose_a; + transpose_b = !transpose_b; + } + + auto sizes_a = mata->sizes(); + auto sizes_b = matb->sizes(); + + m = sizes_a[transpose_result ? 1 : 0]; + k = sizes_a[transpose_result ? 0 : 1]; + n = sizes_b[transpose_result ? 0 : 1]; + lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); + ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); + result_ld = result->stride(transpose_result ? 0 : 1); + transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; + transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; + + // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing + // if the gemm operands are in packed float4 + if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { + k = k * 2; + lda = lda * 2; + ldb = ldb * 2; + } + } + + // Matrix members + char transa, transb; + int64_t m, n, k; + int64_t lda, ldb, result_ld; + c10::MaybeOwned mata, matb, result; + + // Scale members + void* scale_mata_ptr = nullptr; + void* scale_matb_ptr = nullptr; + void* scale_result_ptr = nullptr; + std::optional scale_mata_dtype; + std::optional scale_matb_dtype; + std::optional scale_result_dtype; +}; +} // namespace +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, int64_t& ld_tensor, bool transpose_result, int64_t m, int64_t n) { IntArrayRef tensor_strides = tensor.strides(); @@ -114,6 +283,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa } } +<<<<<<< HEAD /* * Checks whether DISABLE_ADDMM_CUDA_LT is set. * Additionally, for ROCM we test whether the architecture supports the Lt. 
@@ -221,6 +391,36 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const template void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +======= +static bool getDisableAddmmCudaLt() { + static const auto env_value = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT"); + if (env_value == "1") { + return true; + } + return false; +} + +#ifdef USE_ROCM +static bool isSupportedHipLtROCmArch(int index) { + static const std::vector archs = { + "gfx90a", "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1100", "gfx1101", "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +#endif + }; + return at::detail::getCUDAHooks().isGPUArch(archs, index); +} +#endif + +template +static void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); at::cuda::tunable::GemmAndBiasParams params; @@ -259,6 +459,7 @@ void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const } } +<<<<<<< HEAD template bool launchGemmAndBiasCublasLt( // args contains result which is modified @@ -323,6 +524,9 @@ bool launchGemmCublas( Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) { // Shape checks { +======= +Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Make sure to keep addmm_cuda below in sync with this code; it // preflights a check to try to avoid actually needing to call // expand(). 
@@ -332,6 +536,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() ) +<<<<<<< HEAD if (result.is_same(self)) { TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); TORCH_CHECK(self.sizes()[0] == mat1.sizes()[0], "self dim 0 must match mat1 dim 0"); @@ -339,10 +544,13 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } // } Shape checks +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(*c-array*) TensorArg targs[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}}; checkAllSameGPU(__func__, targs); +<<<<<<< HEAD // Handle whether to use the Lt interface { static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()); // if lt path fails, we recurse back into this function here and force the lt path to off @@ -388,6 +596,103 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // Short circuit if the reduction dim is empty if (mat1.sizes()[1] == 0) { +======= + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + IntArrayRef self__sizes; + bool useLtInterface = false; +#if defined(USE_ROCM) + // When hipBLASLt is not supported on the architecture, + // disable_addmm_cuda_lt will always be to set to true + static bool disable_addmm_cuda_lt = + !isSupportedHipLtROCmArch(self.device().index()) || getDisableAddmmCudaLt(); +#else + static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt(); +#endif + // if lt path fails, we recurse back into this function here and force the lt path to off + // we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent + bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; +#if defined(USE_ROCM) && ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + cublasCommonArgs _args(mat1, mat2, result); + if (_args.transa == 't' && _args.transb == 't') { + disable_addmm_cuda_lt_final = true; + } +#endif + at::ScalarType scalar_type = mat1.scalar_type(); + bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; + c10::MaybeOwned self_; + if (&result != &self) { +#if defined(CUDA_VERSION) || defined(USE_ROCM) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. + // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] + // is to use lt interface only when self is bias. 
+ // for cuda 11.4, cublasLtMatmul is activated + // the last two conditions is to skip 16b transA and non-trans-B having + // leading dim >> rows when they are sliced from a large tensor + // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul + if (!disable_addmm_cuda_lt_final) { + useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && + result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && + self.is_contiguous() && result.is_contiguous() && +#ifdef USE_ROCM + (scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && +#else + (scalar_type == at::ScalarType::Double || + scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && +#endif +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM)) + mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +#else + mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && + mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && + mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && + // avoid leading dim >> rows bugs + ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || + (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || + (scalar_type != at::ScalarType::Half && + scalar_type != at::ScalarType::BFloat16)) && + ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) || + (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || + (scalar_type != at::ScalarType::Half && + scalar_type != at::ScalarType::BFloat16)); +#endif + } +#endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } + self__sizes = self_->sizes(); + } else { + self_ = c10::MaybeOwned::borrowed(self); + self__sizes = self_->sizes(); + TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK(self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); + TORCH_CHECK(self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + } + + if (&result != &self) { + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + if (beta.toComplexDouble() != 0.0 && !useLtInterface) { + at::native::copy_(result, *self_); + } + } + + + IntArrayRef result_sizes = result.sizes(); + if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + return result; + } + + cublasCommonArgs args(mat1, mat2, result); + + if (mat1.numel() == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // By definition, when beta==0, values in self should be ignored. nans and infs // should not propagate if (beta.toComplexDouble() == 0.) 
{ @@ -399,6 +704,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma result, self.expand(result.sizes()), at::native::scalar_tensor( +<<<<<<< HEAD beta, self.scalar_type(), std::nullopt /* layout */, @@ -431,12 +737,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma #endif } else { // !is_float_output_with_half_input +======= + beta, + self.scalar_type(), + std::nullopt /* layout */, + at::kCPU, + std::nullopt /* pin_memory */)); + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); + + if (useLtInterface) { +#if defined(USE_ROCM) + bool okay = true; + if (is_float_output_with_half_input) { + TORCH_CHECK(false, "float output with half input is not enabled for ROCm"); + } else { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda_lt", [&] { +<<<<<<< HEAD lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); } ); @@ -449,14 +773,145 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // end Lt path } else { // No Lt, we use a GEMM instead +======= + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + launchTunableGemmAndBias( + args, + alpha, + (&result != &self) ? self.const_data_ptr() : nullptr, + activation_to_gemm_and_blas_arg(activation)); + } else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + // This condition is needed for mm case on ROCm for hipblasLt path. + // Passing the bias ptr as null to avoid accuracy issues for mm case. + (&result != &self) ? 
self.const_data_ptr() : nullptr, + args.result->data_ptr(), + args.result_ld, + activation_to_gemm_and_blas_arg(activation) + ); + } + }); + } + if (!okay) { + // lt path failed; recurse but disable lt path + return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); + } +#else + auto activation_epilogue = activation_to_gemm_and_blas_arg(activation); + bool okay = true; + if (is_float_output_with_half_input) { + AT_DISPATCH_REDUCED_FLOATING_TYPES( + scalar_type, + "addmm_cuda_lt", + [&] { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input"); + } + else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue + ); + }}); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + launchTunableGemmAndBias( + args, + alpha, + self.const_data_ptr(), + activation_epilogue); + } + else { + okay = at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue + ); + }}); + } + if (!okay) { + // lt path failed; recurse but disable lt path + return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true); + } +#endif + } else + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_float_output_with_half_input) { AT_DISPATCH_REDUCED_FLOATING_TYPES( scalar_type, "addmm_cuda", [&] { +<<<<<<< HEAD launchGemmCublas(args, alpha, beta); } ); +======= + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + const scalar_t* mat1_ptr = args.mata->const_data_ptr(); + const scalar_t* mat2_ptr = args.matb->const_data_ptr(); + + float* result_ptr = args.result->mutable_data_ptr(); + at::cuda::blas::gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + alpha_val, + mat1_ptr, + args.lda, + mat2_ptr, + args.ldb, + beta_val, + result_ptr, + args.result_ld); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( at::ScalarType::Half, @@ -464,12 +919,37 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda", [&] { +<<<<<<< HEAD launchGemmCublas(args, alpha, beta); } ); } // Apply epilogue +======= + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + const scalar_t* mat1_ptr = args.mata->const_data_ptr(); + const scalar_t* mat2_ptr = args.matb->const_data_ptr(); + scalar_t* result_ptr = args.result->mutable_data_ptr(); + at::cuda::blas::gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + alpha_val, + mat1_ptr, + args.lda, + mat2_ptr, + args.ldb, + beta_val, + result_ptr, + 
args.result_ld); + }); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (activation) { case Activation::RELU: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) @@ -481,14 +961,22 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma break; default: break; } +<<<<<<< HEAD } // end GEMM path +======= + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Preprocessor gate here needs to match the inverse of the check // gating activation_to_gemm_and_blas_arg above; here we are manually // performing a post-GELU because we weren't able to use the GELU // epilogue above. #if !defined(CUDA_VERSION) && !defined(USE_ROCM) +<<<<<<< HEAD if (!disable_addmm_cuda_lt && activation == Activation::GELU) { +======= + if (useLtInterface && activation == Activation::GELU) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::gelu_(const_cast(*args.result), "tanh"); } #endif @@ -893,6 +1381,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } +<<<<<<< HEAD static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); @@ -922,6 +1411,708 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output"); } +======= +static bool _scaled_mm_allowed_device(bool sm90_only=false) { +#ifdef USE_ROCM + static const std::vector archs = { + "gfx942", +#if ROCM_VERSION >= 60300 + "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60500 + "gfx950" +#endif + }; + return at::detail::getCUDAHooks().isGPUArch(archs); +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + if (sm90_only) { + return dprops->major == 9; + } else { + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + } +#endif +} + +#ifdef USE_ROCM +static bool _scaled_mm_is_fnuz() { + return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); +} +#endif + +namespace{ + +enum class ScalingType : std::uint8_t { + TensorWise, + RowWise, + BlockWise, + Error +}; +/* + * Scaling Type Determination: + * --------------------------- + * Conditions and corresponding Scaling Types: + * + * - If scale tensors are both `Float8_e8m0fnu` or `Float8_e4m3fn`: + * - Returns BlockWise (with additional size checks). + * + * - If scale_a.numel() == 1 && scale_b.numel() == 1: + * - Returns TensorWise. + * + * - Else if scale_a.dim() == 2 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n: + * - Returns RowWise. + * + * - Otherwise: + * - Returns Error. 
+ */ + +// Validates the scale tensors to scaled_mm +// And returns the type of scaling/which kernel to use +ScalingType get_scaling_type( + const at::Tensor& scale_a, + const at::Tensor& scale_b, + int64_t dim_m, + int64_t dim_k, + int64_t dim_n) { + // Check for BlockWise scaling (FP8_E8M0 and FP8_E4M3 types) + if ((scale_a.scalar_type() == scale_b.scalar_type()) && + ((scale_a.scalar_type() == at::kFloat8_e8m0fnu) || (scale_a.scalar_type() == at::kFloat8_e4m3fn))) { + const bool is_nvfp4 = scale_a.scalar_type() == at::kFloat8_e4m3fn; + + // cuBLAS's mxfp8 gemm: block_size is 1 scale per 32 elements + // cuBLAS's nvfp4 gemm: block_size is 1 scale per 16 unpacked elements. + const auto BLOCK_SIZE_K = is_nvfp4 ? 16 : 32; + + constexpr int64_t BLOCK_SIZE_MN = 128; + + // adjust for fp4x2 packing if necessary + const auto dim_k_unpacked = is_nvfp4 ? dim_k * 2 : dim_k; + + auto ceil_div = [](auto a, auto b) { return (a + b - 1) / b; }; + auto num_k_blocks = ceil_div(dim_k_unpacked, BLOCK_SIZE_K); + auto padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4; + + // TODO: We might want to enforce some structure on the shapes of the scale + // tensors + + // Check expected sizes for block-wise scaling + auto expected_a_size = + BLOCK_SIZE_MN * ceil_div(dim_m, BLOCK_SIZE_MN) * padded_num_k_blocks; + auto expected_b_size = + BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks; + + //TODO: enable the checks for ROCm +#ifndef USE_ROCM + TORCH_CHECK(scale_a.numel() == expected_a_size, + "For BlockWise scaling: Expected scale_a size to be ", + expected_a_size, " but got ", scale_a.numel()); + TORCH_CHECK(scale_b.numel() == expected_b_size, + "For BlockWise scaling: Expected scale_b size to be ", + expected_b_size, " but got ", scale_b.numel()); +#endif + + TORCH_CHECK( + scale_a.is_contiguous() && scale_b.is_contiguous(), + "For BlockWise scaling: Both scale_a and scale_b must be contiguous"); + + return ScalingType::BlockWise; + } + // Both Per-Tensor and Row-wise scaling expect fp32 tensors + TORCH_CHECK( + scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "Both scale_a and scale_b must be float (fp32) tensors."); + + // Check the singluar scale case for per-tensor scaling + if (scale_a.numel() == 1 && scale_b.numel() == 1) { + return ScalingType::TensorWise; + } + + // For non-TensorWise scaling, enforce 2D input tensors + TORCH_CHECK( + scale_a.dim() == 2 && scale_b.dim() == 2, + "For non-TensorWise scaling, scale tensors must be 2-dimensional, " + "but got scale_a.dim()=", + scale_a.dim(), + " and scale_b.dim()=", + scale_b.dim()); + + // Check for RowWise scaling + if (scale_a.size(0) == dim_m && scale_a.size(1) == 1 && + scale_b.size(0) == 1 && scale_b.size(1) == dim_n) { +#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || \ + (defined(USE_ROCM) && (defined(HIPBLASLT_VEC_EXT) || defined(HIPBLASLT_OUTER_VEC))) + TORCH_CHECK( + scale_a.is_contiguous() && scale_b.is_contiguous(), + "Both scale_a and scale_b must be contiguous for RowWise scaling."); + return ScalingType::RowWise; +#else + TORCH_CHECK(false, "Per-row scaling is not supported for this platform!"); + return ScalingType::Error; +#endif + } + + // If we reach here, the input doesn't match any valid scaling type + TORCH_CHECK( + false, + "Invalid scaling configuration. For TensorWise scaling, both scales should be scalar. " + "For RowWise scaling, scale_a should be (", + dim_m, + ", 1) and scale_b should be (1, ", + dim_n, + "). 
" + "Got scale_a.size()=(", + scale_a.size(0), + ", ", + scale_a.size(1), + ") and ", + "scale_b.size()=(", + scale_b.size(0), + ", ", + scale_b.size(1), + ")"); + + return ScalingType::Error; +} + +} // namespace + + +// Computes matrix multiply + bias while applying scaling to input and output matrices +// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. +// If output matrix type is 16 or 32-bit type, scale_result is not applied. +// Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `scale_a`: a scalar or 1-dimensional tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type +// - `scale_b`: a scalar or 1-dimensional tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type +// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type +// - `use_fast_accum`: if true, enables fast float8 accumulation +// - `out`: a reference to the output tensor + +Tensor& +_scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + // Check what type of scaling we are doing based on inputs + ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat1.size(1), mat2.size(1)); + TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported"); + + TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1], + " but got ", bias->numel()); + TORCH_CHECK( + mat1.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + ")."); + TORCH_CHECK(mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, "mat2 shape (", mat2.sizes()[0], "x", + mat2.sizes()[1], ") must be divisible by 16"); + // Check types + TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + TORCH_CHECK(isFloat8Type(mat1.scalar_type()) || mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat1 to be 
Float8 or Float4_x2 matrix got ", mat1.scalar_type()); + TORCH_CHECK(isFloat8Type(mat2.scalar_type()) || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat2 to be Float8 or Float4_x2 matrix got ", mat2.scalar_type()); +#ifndef USE_ROCM + // Type restrictions imposed by CuBLASLt as of CUDA-12.1 + TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, + "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + } +#endif + if (use_fast_accum) { + TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype."); + } +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float8_e5m2 is only supported for ROCm 7.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 70000, "Float8_e4m3fn is only supported for ROCm 7.0 and above"); + } +#endif + if (bias) { + TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32"); + TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half, + "Bias must be either Half or BFloat16, but got ", bias->scalar_type()); + TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + auto scale_result_ = scale_result.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{{out, "out", 0}, {mat1, "mat1", 1}, {mat2, "mat2", 2}, + {bias_, "bias", 3}, {scale_a, "scale_a", 4}, {scale_b, "scale_b", 5}, + {scale_result_, "scale_result", 6}}; + checkAllSameGPU(__func__, targs); + } + // Validation checks have passed lets resize the output to actual size + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels + // do not support this case). + if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. 
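// Editor's note (illustrative sketch, not part of the patch): the block-wise branch of
// get_scaling_type() above sizes each scale tensor as 128 * ceil(M/128) * pad4(ceil(K/32))
// for mxfp8 (one scale per 32 elements along K), with a 16-element unpacked block and
// fp4x2 packing adjustment for nvfp4. The standalone helper below reproduces that
// arithmetic with a worked example; it is not part of the ATen code.
#include <cstdint>

constexpr int64_t ceil_div_i64(int64_t a, int64_t b) { return (a + b - 1) / b; }

constexpr int64_t expected_blockwise_scale_numel(int64_t rows, int64_t k, bool is_nvfp4) {
  const int64_t block_k = is_nvfp4 ? 16 : 32;
  const int64_t k_unpacked = is_nvfp4 ? k * 2 : k;  // fp4x2 packs two values per element
  const int64_t padded_k_blocks = ceil_div_i64(ceil_div_i64(k_unpacked, block_k), 4) * 4;
  return 128 * ceil_div_i64(rows, 128) * padded_k_blocks;
}

// Example: an mxfp8 operand with M = 256, K = 1024 expects 128 * 2 * 32 = 8192 scales.
static_assert(expected_blockwise_scale_numel(256, 1024, /*is_nvfp4=*/false) == 8192,
              "block-wise scale sizing example");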
+ if (mat1_sizes[1] == 0) { + out.zero_(); + } + + return out; + } + + // ROCm's hipblaslt supports rowwise, so skip this check that sends this to cutlass. +#ifndef USE_ROCM + // We are doing row-wise scaling + if (scaling_choice == ScalingType::RowWise) { + TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + at::cuda::detail::f8f8bf16_rowwise( + mat1, + mat2, + scale_a, + scale_b, + bias, + use_fast_accum, + out); + return out; + } +#else + if (scaling_choice == ScalingType::RowWise) { + // For ROCm, match behavior of f8f8bf16_rowwise type checking + Tensor b = mat2; + if (_scaled_mm_is_fnuz()) { + TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); + } + else { + TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); + } + // Until more than bf16 is supported + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, + "hipblaslt rowwise _scaled_mm only supports BFloat16 output"); + } + else if (scaling_choice == ScalingType::BlockWise) { +#if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}, 0), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + } +#endif + + cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result); + const auto out_dtype_ = args.result->scalar_type(); + TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); + +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static 
at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = scale_a.scalar_type(); + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = scale_b.scalar_type(); + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; + params.c = args.result->data_ptr(); + params.c_scale_ptr = args.scale_result_ptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype_; + params.use_fast_accum = use_fast_accum; + params.use_rowwise = scaling_choice == ScalingType::RowWise; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + } + else +#endif + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + args.scale_mata_ptr, + args.lda, + args.mata->scalar_type(), + args.scale_mata_dtype.value(), + args.matb->data_ptr(), + args.scale_matb_ptr, + args.ldb, + args.matb->scalar_type(), + args.scale_matb_dtype.value(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + args.scale_result_ptr, + args.result_ld, + out_dtype_, + use_fast_accum, + scaling_choice == ScalingType::RowWise); + } + + return out; +} + +namespace { + at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, + const Tensor& mat_b, + const std::optional& offs, + std::optional out_dtype + ) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + // For TMA transfers, strides of output tensor have to be either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype_); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); + + return out; + } + + bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } + } + + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + if (mat.dim() == 2) { + TORCH_CHECK( + scale.dim() == 1, + "scale must be a 1D tensor, but got ", + scale.dim(), + "D, arg ", + arg_idx); + TORCH_CHECK( + scale.is_contiguous(), "scale must be contiguous for arg ", arg_idx); + TORCH_CHECK( + scale.size(0) == mat.size(dim) * scale_multiplier, + "scale must have the same length as mat for arg ", + arg_idx); + } else { + TORCH_CHECK( + scale.dim() == 2, + "scale must be a 2D tensor, but got ", + scale.dim(), + "D for arg ", + arg_idx); + TORCH_CHECK( + scale.stride(1) == 1, + "scale must be contiguous in the last dimension for arg ", + arg_idx); + TORCH_CHECK( + scale.size(0) == 
mat.size(0), + "scale must have the same batch dimension as mat for arg ", + arg_idx); + TORCH_CHECK( + scale.size(1) == mat.size(1 + dim), + "scale must have the same first dimension as mat for arg ", + arg_idx); + } +} + + +} + +Tensor +_scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); +} + + +Tensor +_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const Tensor& scale_a, const Tensor& scale_b, +const std::optional& offs, +const std::optional& bias, +const std::optional& scale_result, +std::optional out_dtype, +bool use_fast_accum) { +#ifndef USE_ROCM + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0"); + + TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); + TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); + TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + TORCH_CHECK( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK(!scale_result.has_value(), "Scale result not supported yet"); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + + // Both Per-Tensor and Row-wise scaling expect fp32 tensors + TORCH_CHECK( + scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "Both scale_a and scale_b must be float (fp32) tensors."); + + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; + check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); + check_scale(mat_b, scale_b, 1, 1, scale_multiplier); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + + at::cuda::detail::f8f8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); + return out; + + + + +#else + TORCH_CHECK(false, "grouped gemm is not supported on ROCM") +#endif + +} + +Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { +#ifndef USE_ROCM + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true); + TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0"); + + TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); + TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + return out; +#else + TORCH_CHECK(false, "grouped gemm is not supported on ROCM") +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { @@ -933,7 +2124,16 @@ Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::Sca } Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { +<<<<<<< HEAD baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); +======= + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Scalar beta(0.0); Scalar alpha(1.0); { @@ -952,7 +2152,16 @@ Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tenso } Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { +<<<<<<< HEAD baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); 
+======= + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { NoNamesGuard guard; baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -967,12 +2176,15 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy } Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) { +<<<<<<< HEAD TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); TORCH_CHECK( self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same"); TORCH_CHECK(out_dtype == self.scalar_type() || @@ -981,7 +2193,11 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); +<<<<<<< HEAD addmm_out_cuda_impl(out, out, self, mat2, 0, 1); +======= + addmm_out_cuda_impl(const_cast(out), out, self, mat2, 0, 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out; } @@ -992,6 +2208,7 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { +<<<<<<< HEAD TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); @@ -1000,6 +2217,8 @@ Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tens mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(out_dtype == self.scalar_type() || (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh 
b/aten/src/ATen/native/cuda/CUDALoops.cuh index c42d03b9cbf7f..da11bbfdac099 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -297,7 +297,10 @@ static inline void launch_vectorized_kernel( int vec_size = memory::can_vectorize_up_to(data); c10::DeviceIndex curDevice = -1; AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice)); +<<<<<<< HEAD // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread(); #else using cpp_type = typename function_traits::result_type; @@ -436,6 +439,10 @@ static inline void launch_vectorized_templated_kernel( loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); +<<<<<<< HEAD +======= + using traits = function_traits; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) / vectorized_templated_config::block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); @@ -856,6 +863,7 @@ struct type_specialized_kernel_launcher { out_calc_t output_offset_calculator, loader_t loader, storer_t storer) { +<<<<<<< HEAD constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0]; constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1]; constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2]; @@ -863,6 +871,11 @@ struct type_specialized_kernel_launcher { using cret_t = c10::impl::ScalarTypeToCPPTypeT; using carg0_t = c10::impl::ScalarTypeToCPPTypeT; using carg1_t = c10::impl::ScalarTypeToCPPTypeT; +======= + if (ret_t == rt_binary_specializations[arg_index][0] && + arg0_t == rt_binary_specializations[arg_index][1] && + arg1_t == rt_binary_specializations[arg_index][2]) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) launch_vectorized_templated_kernel< func_t, array_t, @@ -870,9 +883,18 @@ struct type_specialized_kernel_launcher { out_calc_t, loader_t, storer_t, +<<<<<<< HEAD cret_t, carg0_t, carg1_t>( +======= + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][0]>::t), + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][1]>::t), + decltype(c10::impl::ScalarTypeToCPPType< + rt_binary_specializations[arg_index][2]>::t)>( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) numel, f, data, @@ -880,10 +902,79 @@ struct type_specialized_kernel_launcher { output_offset_calculator, loader, storer); +<<<<<<< HEAD } } }; +======= + } +}; + +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == 
rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] + offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace #endif @@ -1002,6 +1093,35 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { } auto offset_calc = ::make_offset_calculator(iter); #ifdef USE_ROCM +<<<<<<< HEAD +======= + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. + using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int grp_sz = 128; launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { if (unrl) { diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 0d34bd52f211a..524578b07c90b 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -11,11 +11,31 @@ #include +<<<<<<< HEAD +======= +#if defined(USE_ROCM) +// TODO(lufang): Tensor.item() on AMD HIP is not synced in the Recsys models. +// This is just a short term workaround. Issue is tracked as FBA-388 on the AMD side. 
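// Editor's note (illustrative sketch, not part of the patch): returning to the
// CUDALoops.cuh hunk above, the new type_specialized_broadcast_kernel_launcher is taken
// only when the iterator's runtime (out, in0, in1) dtype triple matches one of the
// compile-time rt_binary_specializations; otherwise the generic offset-calculator loop
// with dynamic casting runs. The host-side analogue below shows just that matching step.
// The concrete triples listed here are assumptions for illustration (float output with
// float/half/bfloat16 inputs), not a copy of the real table.
#include <array>
#include <cstdint>

enum class DType : std::uint8_t { Float, Half, BFloat16 };

// Same layout as rt_binary_specializations: {result, arg0, arg1}.
constexpr std::array<std::array<DType, 3>, 3> kSpecializedTriples{{
    {{DType::Float, DType::BFloat16, DType::Float}},
    {{DType::Float, DType::Float, DType::BFloat16}},
    {{DType::Float, DType::Float, DType::Half}},
}};

// True when the runtime dtypes would hit the specialized broadcast fast path.
inline bool takes_specialized_broadcast_path(DType out, DType in0, DType in1) {
  for (const auto& t : kSpecializedTriples) {
    if (t[0] == out && t[1] == in0 && t[2] == in1) {
      return true;
    }
  }
  return false;
}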
+namespace { + bool use_sync_mode() { + static const bool sync_mode = c10::utils::check_env("HIP_DOUBLE_SYNC_ON_LOCAL_SCALE_DENSE") == true; + return sync_mode; + } +} +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; TORCH_CHECK(self.numel() > 0, "_local_scalar_dense: Empty tensor not supported"); +<<<<<<< HEAD +======= +#if defined(USE_ROCM) + if (!use_sync_mode()){ +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_V2( self.scalar_type(), "_local_scalar_dense_cuda", AT_WRAP([&] { // Create pinned memory for the scalar value to avoid implicit @@ -32,6 +52,18 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) { at::cuda::memcpy_and_sync((void *)value.const_data_ptr(), self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); r = Scalar(*value.const_data_ptr()); }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +<<<<<<< HEAD +======= +#if defined(USE_ROCM) + } else { + auto cpu_self = self.cpu(); + AT_DISPATCH_V2( + self.scalar_type(), "_local_scalar_dense_hip", AT_WRAP([&] { + r = Scalar(*cpu_self.const_data_ptr()); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return r; } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 754582d2d9777..aec69425e4b3f 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,4 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +<<<<<<< HEAD #include #include #include @@ -6,6 +7,15 @@ #include #include #include +======= +#include +#include +#include +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -27,6 +37,7 @@ namespace at::native { +<<<<<<< HEAD namespace { // Initial pool size for CUDA events per device. @@ -45,6 +56,8 @@ at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) { } // namespace +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void neg_kernel_cuda(TensorIteratorBase &iter); void conj_kernel_cuda(TensorIteratorBase &iter); @@ -281,6 +294,7 @@ void copy_device_to_device(TensorIterator& iter, // write-after-read dependencies on the destination side are handled, so // that no one is operating on the dst memory when we perform the copy. 
// src waits on dst barrier (src already waits on src) +<<<<<<< HEAD // Use event pool for better performance instead of creating new events auto dst_ready = getEventFromPool(dst_device.index()); @@ -289,6 +303,14 @@ void copy_device_to_device(TensorIterator& iter, device_guard.set_device(src_device); dst_ready->block(copy_stream); +======= + CUDAEvent dst_ready; + device_guard.set_device(dst_device); + dst_ready.record(getCurrentCUDAStream(dst_device.index())); + + device_guard.set_device(src_device); + dst_ready.block(copy_stream); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (memcpy_eligible) { @@ -327,11 +349,19 @@ void copy_device_to_device(TensorIterator& iter, // operate on dst's copy until the copy is complete. // Still on src_device, record stream event +<<<<<<< HEAD auto src_ready = getEventFromPool(src_device.index()); src_ready->record(copy_stream); device_guard.set_device(dst_device); src_ready->block(getCurrentCUDAStream(dst_device.index())); +======= + CUDAEvent src_ready; + src_ready.record(copy_stream); + + device_guard.set_device(dst_device); + src_ready.block(getCurrentCUDAStream(dst_device.index())); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } AT_CUDA_CHECK(cudaGetLastError()); diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 333c21e94f18e..49e14b9b51540 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -223,7 +223,11 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo class CuFFTConfig { public: +<<<<<<< HEAD // Only move semantics is enough for this class. Although we already use +======= + // Only move semantics is enought for this class. Although we already use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // unique_ptr for the plan, still remove copy constructor and assignment op so // we don't accidentally copy and take perf hit. 
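// Editor's note (illustrative sketch, not part of the patch): in the Copy.cu hunk above,
// both sides of the conflict implement the same handshake for cross-device copies -- the
// copy stream on the source device waits until the destination stream has retired prior
// work on dst, and afterwards the destination stream waits for the copy -- differing only
// in whether the events come from a per-device pool (HEAD) or are created per copy.
// The raw CUDA runtime sketch below shows that handshake; it is not the ATen code.
#include <cstddef>
#include <cuda_runtime.h>

void cross_device_copy_with_events(void* dst, int dst_device, cudaStream_t dst_stream,
                                   const void* src, int src_device, cudaStream_t copy_stream,
                                   size_t nbytes) {
  cudaEvent_t dst_ready = nullptr;
  cudaEvent_t copy_done = nullptr;
  cudaEventCreateWithFlags(&dst_ready, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);

  // dst side: mark the point after which dst memory is safe to overwrite.
  cudaSetDevice(dst_device);
  cudaEventRecord(dst_ready, dst_stream);

  // src side: wait for that point, then issue the copy and mark its completion.
  cudaSetDevice(src_device);
  cudaStreamWaitEvent(copy_stream, dst_ready, 0);
  cudaMemcpyPeerAsync(dst, dst_device, src, src_device, nbytes, copy_stream);
  cudaEventRecord(copy_done, copy_stream);

  // dst side: do not consume the copied data before the copy has finished.
  cudaSetDevice(dst_device);
  cudaStreamWaitEvent(dst_stream, copy_done, 0);

  // Safe even while pending: resources are released once the events complete.
  cudaEventDestroy(dst_ready);
  cudaEventDestroy(copy_done);
}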
CuFFTConfig(const CuFFTConfig&) = delete; diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h index 38013137f0a40..42e5e9fee11a0 100644 --- a/aten/src/ATen/native/cuda/CuFFTUtils.h +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -38,12 +38,22 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) return "CUFFT_INVALID_SIZE"; case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; +<<<<<<< HEAD case CUFFT_INVALID_DEVICE: return "CUFFT_INVALID_DEVICE"; +======= + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case CUFFT_NO_WORKSPACE: return "CUFFT_NO_WORKSPACE"; case CUFFT_NOT_IMPLEMENTED: return "CUFFT_NOT_IMPLEMENTED"; +<<<<<<< HEAD #if CUDA_VERSION <= 12090 case CUFFT_INCOMPLETE_PARAMETER_LIST: return "CUFFT_INCOMPLETE_PARAMETER_LIST"; @@ -51,6 +61,9 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) return "CUFFT_PARSE_ERROR"; #endif #if !defined(USE_ROCM) && CUDA_VERSION <= 12090 +======= +#if !defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case CUFFT_LICENSE_ERROR: return "CUFFT_LICENSE_ERROR"; #endif diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 344906a2a4df2..a51d87cf73a55 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -38,6 +38,7 @@ __device__ inline int min(int a, int b) { #define BLOCK_STRIDE_BWD 2 // increasing block_stride to lower # of blocks launched #endif +<<<<<<< HEAD template static __device__ inline index_t p_start(index_t size, int pad, int kernel, int dilation, int stride) { const auto kernel_extent = static_cast((kernel - 1) * dilation + 1); @@ -73,6 +74,14 @@ static inline bool can_use_int32_nhwc( if (height * width > int_max) return false; return true; +======= +static __device__ inline int p_start(int size, int pad, int kernel, int dilation, int stride) { + return (size + pad < ((kernel - 1) * dilation + 1)) ? 
0 : (size + pad - ((kernel - 1) * dilation + 1)) / stride + 1; +} + +static __device__ inline int p_end(int size, int pad, int pooled_size, int stride) { + return min((size + pad) / stride + 1, pooled_size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // kernels borrowed from Caffe @@ -114,6 +123,7 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom } } +<<<<<<< HEAD template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) __global__ void max_pool_forward_nhwc( @@ -133,6 +143,23 @@ __global__ void max_pool_forward_nhwc( index_t *out_mask_cached = reinterpret_cast(smem_raw); scalar_t *out_cached = reinterpret_cast( out_mask_cached + kernel_size_C*blockDim.x*blockDim.y*blockDim.z); +======= +template +C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) +__global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch, + const int64_t channels, const int64_t height, + const int64_t width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + const int in_stride_n, const int in_stride_c, + const int in_stride_h, const int in_stride_w, + const int kernel_stride_C, const int kernel_size_C, + scalar_t* top_data, int64_t* top_mask) { + extern __shared__ int smem[]; + int *out_mask_cached = smem; + scalar_t *out_cached = reinterpret_cast(&out_mask_cached[kernel_size_C*blockDim.x*blockDim.y*blockDim.z]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // flattening cta for pre-computation & smem initialization; int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); @@ -151,6 +178,7 @@ __global__ void max_pool_forward_nhwc( int channel_id = blockIdx.x / nbatch; int channel_offset = threadIdx.x + channel_id * blockDim.x; +<<<<<<< HEAD top_data = top_data + static_cast(batch_id) * (pooled_height * pooled_width * channels); top_mask = top_mask + static_cast(batch_id) * (pooled_height * pooled_width * channels); bottom_data = bottom_data + static_cast(batch_id) * in_stride_n; @@ -171,6 +199,28 @@ __global__ void max_pool_forward_nhwc( for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { index_t wstart = static_cast(ow) * stride_w - pad_w; index_t wend = std::min(wstart + static_cast((kernel_w - 1) * dilation_w + 1), width); +======= + top_data = top_data + batch_id * pooled_height * pooled_width * channels; + top_mask = top_mask + batch_id * pooled_height * pooled_width * channels; + bottom_data = bottom_data + batch_id * in_stride_n; + + out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + out_mask_cached = &out_mask_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + + int oH = (pooled_height + gridDim.z-1) / gridDim.z; + int oW = (pooled_width + gridDim.y-1) / gridDim.y; + int ostartH = threadIdx.z + blockIdx.z*oH; + int oendH = ::min(ostartH+oH, pooled_height); + int ostartW = threadIdx.y + blockIdx.y*oW; + int oendW = ::min(ostartW+oW, pooled_width); + + for (int oh = ostartH; oh < oendH; oh+=blockDim.z) { + int hstart = oh * stride_h - pad_h; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { + int wstart = ow * stride_w - pad_w; + int wend = min(wstart + (kernel_w - 1) * dilation_w 
+ 1, width); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) while(hstart < 0) hstart += dilation_h; while(wstart < 0) @@ -218,12 +268,21 @@ __global__ void max_pool_forward_nhwc( // Else do it Non-Prefetch... else #endif +<<<<<<< HEAD for (index_t ih = hstart; ih < hend; ih += dilation_h) { for (index_t iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { scalar_t val = ptr_input[c * in_stride_c]; +======= + for (int ih = hstart; ih < hend; ih += dilation_h) { + for (int iw = wstart; iw < wend; iw += dilation_w) { + int cached_index = threadIdx.x; + const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; + for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + scalar_t val = ptr_input[c*in_stride_c]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ((val > out_cached[cached_index]) || at::_isnan(val)) { out_cached[cached_index] = val; out_mask_cached[cached_index] = ih * width + iw; @@ -233,6 +292,7 @@ __global__ void max_pool_forward_nhwc( } } +<<<<<<< HEAD scalar_t *ptr_output_data = top_data + (static_cast(oh) * pooled_width + ow) * channels; int64_t *ptr_output_mask = top_mask + (static_cast(oh) * pooled_width + ow) * channels; @@ -242,6 +302,17 @@ __global__ void max_pool_forward_nhwc( ptr_output_mask[c] = static_cast(out_mask_cached[cached_index]); out_cached[cached_index] = at::numeric_limits::lower_bound(); out_mask_cached[cached_index] = index_t(0); +======= + scalar_t *ptr_output_data = top_data + (oh * pooled_width + ow) * channels; + int64_t *ptr_output_mask = top_mask + (oh * pooled_width + ow) * channels; + + int cached_index = threadIdx.x; + for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + ptr_output_data[c] = out_cached[cached_index]; + ptr_output_mask[c] = out_mask_cached[cached_index]; + out_cached[cached_index] = at::numeric_limits::lower_bound(); + out_mask_cached[cached_index] = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cached_index += blockDim.x; } } @@ -249,7 +320,11 @@ __global__ void max_pool_forward_nhwc( } +<<<<<<< HEAD static constexpr int BLOCK_THREADS = 256; +======= +static const int BLOCK_THREADS = 256; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template #if defined (USE_ROCM) @@ -495,11 +570,14 @@ const Tensor& indices) { maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); const dim3 block(block_x, block_y, block_z); +<<<<<<< HEAD bool use_int32 = can_use_int32_nhwc( nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, in_stride_n, in_stride_c, in_stride_h, in_stride_w); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int kernel_stride_C = ceil_div( safe_downcast(nInputPlane), block_x * 4); int kernel_size_C = ceil_div( @@ -514,6 +592,7 @@ const Tensor& indices) { ceil_div(safe_downcast(outputHeight), block_z*BLOCK_STRIDE_FWD)); const dim3 grid(grid_x, grid_y, grid_z); +<<<<<<< HEAD size_t shmem_size; size_t mask_elems = 
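In the ROCm-side kernel body above, the dynamic shared memory is laid out as an index cache followed by a value cache, and each (threadIdx.y, threadIdx.z) pair owns a contiguous run of kernel_size_C * blockDim.x entries in both. A hedged sketch of that slice arithmetic, written as a standalone device helper rather than the in-place pointer rebinding the kernel does:

    // Offset (in elements) of the cache slice owned by one (tid_y, tid_z) thread pair.
    __device__ inline int cache_slice_offset(int tid_y, int tid_z,
                                             int block_dim_x, int block_dim_y,
                                             int kernel_size_C) {
        return (tid_z * block_dim_y + tid_y) * kernel_size_C * block_dim_x;
    }

The HEAD side keeps the same layout but types the index cache as index_t rather than int, so its offsets differ only in element size.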
static_cast(kernel_size_C) * block_x * block_y * block_z; @@ -549,6 +628,20 @@ const Tensor& indices) { kernel_stride_C, kernel_size_C, output_data, indices_data); } +======= + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 9dcfea3af9435..0bf5bbd42cf68 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -494,7 +494,11 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) auto value = static_cast(rand * range + from); // reverse the bounds of curand4 from (0, 1] to [0, 1) // Note that this method is from legacy THCTensorRandom and is likely to give +<<<<<<< HEAD // you more 0-s, since, the probability of getting 1-s is higher than 0-s and +======= + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 auto reverse_bound_value = value == to ? 
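The ROCm side of the launch code above sizes the kernel's dynamic shared memory as one int plus one scalar_t per cached slot and AT_ASSERTs that it fits the device's per-block limit; the HEAD side starts from the same slot count (mask_elems) before its own sizing and index-type dispatch. A hedged sketch of that guard using the plain CUDA runtime in place of the ATen device-property helper (function names here are illustrative):

    #include <cuda_runtime.h>

    // Bytes of dynamic shared memory: one index slot plus one scalar slot per cached element.
    size_t nhwc_smem_bytes(int kernel_size_C, dim3 block,
                           size_t index_bytes, size_t scalar_bytes) {
        size_t slots = static_cast<size_t>(kernel_size_C) * block.x * block.y * block.z;
        return slots * (index_bytes + scalar_bytes);
    }

    bool smem_fits(size_t smem_bytes, int device_id = 0) {
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, device_id);
        return smem_bytes <= prop.sharedMemPerBlock;
    }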
from : value; diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 65b0e1441de78..5fb661a0cf38c 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -15,7 +15,13 @@ #include #include +<<<<<<< HEAD #include +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -34,9 +40,15 @@ namespace at::native { namespace { #if defined(USE_ROCM) +<<<<<<< HEAD static constexpr int BLOCKDIMY = 16; #else static constexpr int BLOCKDIMY = 32; +======= +static const int BLOCKDIMY = 16; +#else +static const int BLOCKDIMY = 32; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif template @@ -238,6 +250,13 @@ __global__ void renorm_kernel( } // anonymous namespace +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_SCAN_BY_KEY() +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_, int64_t num_weights, int64_t padding_idx, @@ -300,6 +319,10 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -310,7 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto count_data = count.mutable_data_ptr(); cuda::cub::inclusive_sum_by_key( sorted_data, +<<<<<<< HEAD ATEN_CUB_CONSTANT_ITERATOR(index_t)(1), +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::ConstantInputIterator(1), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) count_data, num_indices ); @@ -322,10 +349,22 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice thrust::make_reverse_iterator(sorted_data + num_indices), thrust::make_reverse_iterator(static_cast(count_data) + num_indices), thrust::make_reverse_iterator(count_data + num_indices), +<<<<<<< HEAD ATEN_CUB_MAXIMUM(), num_indices ); }); +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max(), + num_indices + ); + }); +#else + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { + embedding_dense_backward_cuda_scan(sorted_indices, count); + }); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return embedding_backward_cuda_kernel(grad, orig_indices, @@ -357,7 +396,11 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && +<<<<<<< HEAD num_threads() <= static_cast(cuda_utils::kCUDABlockReduceMaxThreads()), +======= + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(), +>>>>>>> 
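For scale_grad_by_freq, the Embedding.cu hunk above counts how many times each index occurs: an inclusive sum-by-key over a constant stream of ones, keyed by the sorted indices, gives a running count within each duplicate run, and a second scan over the reversed range with a maximum operator broadcasts each run's total back to every position in the run; the conflict is only about which cub spelling to use, plus a fallback when scan-by-key is unavailable. A hedged thrust-only analogue of the two passes (an assumption-level sketch, not ATen's cub wrapper or its fallback):

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/scan.h>

    void count_sorted_indices(const thrust::device_vector<int64_t>& sorted,
                              thrust::device_vector<int64_t>& count) {  // count presized to sorted.size()
        // Pass 1: running count within each duplicate run (1, 2, 3, ...).
        thrust::inclusive_scan_by_key(sorted.begin(), sorted.end(),
                                      thrust::make_constant_iterator<int64_t>(1),
                                      count.begin());
        // Pass 2: scan the reversed range with max so every position sees the run total.
        thrust::inclusive_scan_by_key(sorted.rbegin(), sorted.rend(),
                                      count.rbegin(), count.rbegin(),
                                      thrust::equal_to<int64_t>(),
                                      thrust::maximum<int64_t>());
    }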
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "BlockReduceSum requires all warps be active"); const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr(); dim3 grid = unique_indices.numel(); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 6ce419137345f..ee8253de43456 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -10,7 +10,13 @@ #include +<<<<<<< HEAD #include +======= +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -86,9 +92,15 @@ __global__ void compute_grad_weight_bags( const int64_t stride_warped) { int64_t num_of_segments = *num_of_segments_ptr; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -132,9 +144,15 @@ __global__ void compute_grad_weight( int64_t num_of_segments = *num_of_segments_ptr; using accscalar_t = acc_type; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -165,9 +183,15 @@ __global__ void sum_and_scatter( int64_t num_of_segments = *num_of_segments_ptr; int64_t num_of_partial_segments = *num_of_partial_segments_ptr; +<<<<<<< HEAD const int64_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int64_t id = gid / stride_warped; const int64_t startFeature = gid % stride_warped; +======= + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + const int id = gid / stride_warped; + const int startFeature = gid % stride_warped; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (startFeature >= stride) { return; } @@ -194,9 +218,24 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm partials_per_segment_offset[num_of_segments-1]; } +<<<<<<< HEAD + +} // anon namespace + +======= +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() +__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { + *num_of_segments_ptr = num_of_segments; +} +#endif } // anon namespace +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() +template +int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor embedding_backward_cuda_kernel( const Tensor &grad, @@ -223,12 
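The HEAD side of the EmbeddingBackwardKernel.cu hunks above widens the flattened thread id, and the values derived from it, to int64_t: with the previous int arithmetic, blockIdx.x * blockDim.x can wrap once a launch covers more than INT_MAX threads, which a large stride_warped times many segments can reach. A minimal sketch of the widened form (helper name chosen here for illustration):

    #include <cstdint>

    __device__ inline int64_t flat_thread_id() {
        // Promote before multiplying so the whole expression is evaluated in 64 bits.
        return static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    }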
+262,26 @@ Tensor embedding_backward_cuda_kernel( auto segment_offsets = at::empty({numel}, orig_indices.options()); auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr(); +<<<<<<< HEAD +======= +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { + int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key(sorted_indices, segment_offsets); + write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { cuda::cub::unique_by_key( sorted_indices.const_data_ptr(), thrust::make_counting_iterator(0), segment_offsets.mutable_data_ptr(), num_of_segments_ptr, sorted_indices.numel()); }); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t max_segments = std::min(numel, num_weights); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ab3747df031eb..86c021c8c1435 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -31,10 +31,23 @@ #include +<<<<<<< HEAD #include namespace at::native { +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +#include +#endif + +namespace at::native { + +#if !CUB_SUPPORTS_SCAN_BY_KEY() +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { @@ -193,6 +206,10 @@ Tensor embedding_bag_backward_cuda_sum_avg( if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); +<<<<<<< HEAD +======= +#if CUB_SUPPORTS_SCAN_BY_KEY() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -203,7 +220,11 @@ Tensor embedding_bag_backward_cuda_sum_avg( auto count_data = count.mutable_data_ptr(); cuda::cub::inclusive_sum_by_key( sorted_data, +<<<<<<< HEAD ATEN_CUB_CONSTANT_ITERATOR(index_t)(1), +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::ConstantInputIterator(1), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) count_data, num_indices ); @@ -215,10 +236,22 @@ Tensor embedding_bag_backward_cuda_sum_avg( thrust::make_reverse_iterator(sorted_data + num_indices), thrust::make_reverse_iterator(count_data + num_indices), thrust::make_reverse_iterator(count_data + num_indices), +<<<<<<< HEAD ATEN_CUB_MAXIMUM(), num_indices ); }); +======= + NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max(), + num_indices + ); + }); +#else + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { + embedding_dense_backward_cuda_scan(sorted_indices, count); + }); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
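The unique_by_key call above turns the sorted indices into segment offsets: with a counting iterator as the values, keeping one value per distinct key records the position where each embedding row's run of gradients begins, and the number of kept keys is the number of segments; the conflicted #if merely routes toolchains without cub unique_by_key support to a separately compiled legacy helper. A hedged thrust analogue of the same step (not the ATen cuda::cub wrapper used here):

    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/unique.h>

    int64_t segment_offsets_from_sorted(const thrust::device_vector<int64_t>& sorted,
                                        thrust::device_vector<int64_t>& unique_keys,
                                        thrust::device_vector<int64_t>& offsets) {
        // Both output vectors are assumed presized to sorted.size().
        // Keep the first position of every distinct sorted index.
        auto ends = thrust::unique_by_key_copy(sorted.begin(), sorted.end(),
                                               thrust::make_counting_iterator<int64_t>(0),
                                               unique_keys.begin(), offsets.begin());
        return ends.first - unique_keys.begin();  // number of segments
    }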
kernel for mixed dtypes with float/bfloat16/half (#2791)) } return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag, diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 227d42247ebd9..5b22b01893130 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -51,7 +51,11 @@ std::vector foreach_tensor_list_op( Op(), alpha.to()); +<<<<<<< HEAD return std::move(tensor_lists[2]); +======= + return tensor_lists[2]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 9ac0e875b2d68..cadc1490970ba 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -45,7 +45,11 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op(), scalar.to()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu index b28aa690630b4..2fe7112dedfae 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -33,7 +33,11 @@ std::vector foreach_binary_op( } tensor_lists.emplace_back(tensors.vec()); +<<<<<<< HEAD tensor_lists.emplace_back(std::move(vec_res)); +======= + tensor_lists.emplace_back(vec_res); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using opmath_t = at::opmath_type; multi_tensor_apply<2, opmath_t>( @@ -46,7 +50,11 @@ std::vector foreach_binary_op( /* res_arg_index */ 1>(), Op()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu index bc6bd37891258..62696916cccda 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu @@ -56,7 +56,11 @@ std::vector foreach_binary_op( Op(), scalar.data_ptr(), alpha.to()); +<<<<<<< HEAD return std::move(tensor_lists[1]); +======= + return tensor_lists[1]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template class Op> diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index c121d971cd7be..a6938aeb94d24 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -208,7 +208,11 @@ struct BinaryOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
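The ForeachBinaryOp* conflicts above all come down to one C++ detail: tensor_lists is a local vector of tensor lists, and returning one of its elements by subscript yields an lvalue, so a plain return copies the inner vector while return std::move(...) moves it out (NRVO only applies when the whole local object is returned by name). A small standalone illustration with ordinary std::vector standing in for the tensor types:

    #include <utility>
    #include <vector>

    std::vector<int> take_result() {
        std::vector<std::vector<int>> lists(3);
        lists[2] = {1, 2, 3};
        return std::move(lists[2]);  // moves the element; `return lists[2];` would copy it
    }

The same reasoning applies to tensor_lists.emplace_back(std::move(vec_res)) versus emplace_back(vec_res) in the ScalarList variant above.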
float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t scalar) { @@ -232,7 +236,11 @@ struct BinaryOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -256,7 +264,11 @@ struct BinaryOpListAlphaFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t alpha) { @@ -308,7 +320,11 @@ struct BinaryOpScalarTensorFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, T* scalar, @@ -364,7 +380,11 @@ struct BinaryOpScalarTensorFunctor { template struct ZeroFunctor { __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata<1>& tl) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; @@ -406,7 +426,11 @@ struct UnaryOpFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -458,7 +482,11 @@ struct PointwiseOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t scalar) { @@ -482,7 +510,11 @@ struct PointwiseOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -506,7 +538,11 @@ struct PointwiseOpListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; @@ -557,7 +593,11 @@ struct TernaryOpListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int 
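The ForeachFunctors.cuh conflicts above all change the functors' chunk_size parameter from int to int64_t. The likely motivation is the offset arithmetic inside multi_tensor_apply-style functors, where a chunk's base offset is on the order of chunk_idx * chunk_size; with two 32-bit operands that product is computed in int and can wrap for very large tensors. A hedged, hypothetical helper showing the widened form (not the actual functor code):

    #include <cstdint>

    __device__ inline int64_t chunk_base_offset(int chunk_idx, int64_t chunk_size) {
        // chunk_idx is promoted to int64_t, so the multiply cannot wrap.
        return chunk_idx * chunk_size;
    }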
chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op) { static_assert(depth == 3 || depth == 4, ""); @@ -611,7 +651,11 @@ struct TernaryOpScalarFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListMetadata& tl, Op op, opmath_t alpha) { @@ -668,7 +712,11 @@ struct TernaryOpScalarListFunctor { using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator()( +<<<<<<< HEAD int64_t chunk_size, +======= + int chunk_size, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorListScalarListMetadata& tl, Op op) { static_assert(depth == 2 || depth == 3, ""); diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index 7f563f55d5565..4bd8e26ff7b50 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -57,7 +57,11 @@ std::vector foreach_pointwise_op( scalar.to()); }); +<<<<<<< HEAD return std::move(tensor_lists[3]); +======= + return tensor_lists[3]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template