4 changes: 4 additions & 0 deletions .bazelrc
@@ -2,7 +2,11 @@ build --cxxopt=--std=c++17
build --copt=-I.
# Bazel does not support including its cc_library targets as system
# headers. We work around this for generated code
# (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
# system include path.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin
12 changes: 12 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,6 +3,7 @@ set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
@@ -17,6 +18,10 @@ if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
export BUILD_BUNDLE_PTXAS=1
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -30,14 +35,19 @@ cd /
# on the mounted pytorch repo
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
# USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1

# Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
Expand All @@ -48,6 +58,8 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi

# USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
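
For reference, TORCH_CUDA_ARCH_LIST is a semicolon-separated list of CUDA compute capabilities that PyTorch's build consumes when deciding which device code to generate. A minimal sketch of the naming convention (purely illustrative, not code from this repo):

```python
# Illustrative only: map TORCH_CUDA_ARCH_LIST entries to SM architecture names.
arch_list = "8.0;9.0"  # the value exported above for CUDA 12.6

# Compute capability "9.0" corresponds to the sm_90 architecture.
sm_names = ["sm_" + arch.replace(".", "") for arch in arch_list.split(";")]
print(sm_names)  # ['sm_80', 'sm_90']
```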
81 changes: 81 additions & 0 deletions .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -69,6 +69,7 @@ def replace_tag(filename) -> None:
f.writelines(lines)


def patch_library_rpath(
folder: str,
lib_name: str,
@@ -131,11 +132,14 @@ def copy_and_patch_library(
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)


def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"""
Package the cuda wheel libraries
"""
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
@@ -249,15 +253,77 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
# Copy libraries to unzipped_folder/torch/lib
for lib_path in libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)

# Make sure the wheel is tagged with manylinux_2_28
for f in os.scandir(f"{folder}/tmp/"):
if f.is_dir() and f.name.endswith(".dist-info"):
replace_tag(f"{f.path}/WHEEL")
break

os.system(f"wheel pack {folder}/tmp/ -d {folder}")
os.system(f"rm -rf {folder}/tmp/")


def complete_wheel(folder: str) -> str:
@@ -280,7 +346,18 @@ def complete_wheel(folder: str) -> str:
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]

print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
@@ -320,6 +397,7 @@ def parse_arguments():
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

# Handle PyPI NVIDIA libraries vs bundled libraries
@@ -331,6 +409,9 @@ else:
else:
print("Configuring build for bundled NVIDIA libraries")
# Keep existing static linking approach - already configured above

override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
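
The bodies of patch_library_rpath and copy_and_patch_library are collapsed in this diff. As a rough sketch of what such helpers typically do (the names, signatures, and flags below are assumptions, not the repo's actual code): each bundled library is copied into the unpacked wheel, and its rpath is set to $ORIGIN so it resolves sibling libraries from its own directory.

```python
import os
import shutil
import subprocess


def patch_library_rpath_sketch(folder: str, lib_name: str) -> None:
    """Hypothetical sketch: point a bundled library's rpath at $ORIGIN."""
    lib_path = os.path.join(folder, "tmp", "torch", "lib", lib_name)
    # --force-rpath makes patchelf write DT_RPATH rather than DT_RUNPATH.
    subprocess.check_call(
        ["patchelf", "--set-rpath", "$ORIGIN", "--force-rpath", lib_path]
    )


def copy_and_patch_library_sketch(lib_path: str, folder: str) -> None:
    """Hypothetical sketch: copy a system library into the unpacked wheel,
    then patch its rpath so the loader prefers the bundled copies."""
    lib_name = os.path.basename(lib_path)
    shutil.copy2(lib_path, os.path.join(folder, "tmp", "torch", "lib", lib_name))
    patch_library_rpath_sketch(folder, lib_name)
```

Pointing the rpath at $ORIGIN is what keeps the wheel relocatable: each shared object searches the directory it lives in before any system CUDA installation.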
24 changes: 24 additions & 0 deletions .ci/aarch64_linux/build_aarch64_wheel.py
@@ -438,7 +438,13 @@ def build_torchvision(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -493,7 +499,13 @@ def build_torchdata(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -549,7 +561,13 @@ def build_torchtext(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

@@ -607,7 +625,13 @@ def build_torchaudio(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

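
Each build function above derives PYTORCH_VERSION with the same expression: strip the leading "v" from a tag-like branch name, then drop everything after the first hyphen. A quick illustration with a made-up branch name:

```python
branch = "v2.5.0-rc1"  # hypothetical tag-like branch name
version = branch[1:].split("-", maxsplit=1)[0]
print(version)  # 2.5.0
```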
3 changes: 3 additions & 0 deletions .ci/docker/README.md
@@ -36,6 +36,7 @@ See `build.sh` for valid build environments (it's the giant switch).
# Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
```

## [Guidance] Adding a New Base Docker Image

@@ -137,3 +138,5 @@ If your new Docker image needs a library installed from a specific pinned commit

The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the
pinned commit updates.
10 changes: 10 additions & 0 deletions .ci/docker/almalinux/Dockerfile
@@ -64,10 +64,13 @@ FROM cuda as cuda12.9
RUN bash ./install_cuda.sh 12.9
ENV DESIRED_CUDA=12.9

FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0

FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
@@ -80,10 +83,17 @@ ADD ./common/install_mnist.sh install_mnist.sh
RUN bash ./install_mnist.sh

FROM base as all_cuda
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0

# Final step
FROM ${BASE_TARGET} as final