
Commit 4075646

jerrymannilAMD authored and committed
Cherry-picked commit with merge conflict
1 parent 29e3779 commit 4075646

4,524 files changed: +308,302 −66 lines


.bazelrc

Lines changed: 4 additions & 0 deletions
```diff
@@ -2,7 +2,11 @@ build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
+<<<<<<< HEAD
 # (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
+=======
+# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # system include path.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin
```
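This hunk ships both sides of the conflict verbatim: HEAD's `torch/headeronly/macros/cmake_macros.h` comment and the incoming branch's `c10/macros/cmake_macros.h` variant, fenced by raw `<<<<<<<`/`=======`/`>>>>>>>` markers. A minimal sketch (hypothetical, not part of this commit) of a pre-merge check that flags such leftover markers before they land:

```python
# Hypothetical checker, not part of this commit: walk a checkout and
# report lines that look like unresolved merge-conflict markers.
import os
import sys


def find_conflict_markers(root: str) -> list[tuple[str, int]]:
    """Return (path, line_number) pairs for suspected conflict markers."""
    hits = []
    for dirpath, _dirnames, filenames in os.walk(root):
        if ".git" in dirpath.split(os.sep):
            continue  # skip git metadata, which legitimately contains markers
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                with open(path, encoding="utf-8", errors="ignore") as f:
                    for lineno, line in enumerate(f, start=1):
                        # "=======" alone is skipped: it false-positives on
                        # reStructuredText-style heading underlines.
                        if line.startswith(("<<<<<<< ", ">>>>>>> ")):
                            hits.append((path, lineno))
            except OSError:
                continue
    return hits


if __name__ == "__main__":
    root = sys.argv[1] if len(sys.argv) > 1 else "."
    for path, lineno in find_conflict_markers(root):
        print(f"{path}:{lineno}: unresolved conflict marker")
```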

.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 18 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@ set -eux -o pipefail
 
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 
+<<<<<<< HEAD
 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0"
@@ -19,6 +20,10 @@ if [[ "$DESIRED_CUDA" == *"13"* ]]; then
     export TORCH_NVCC_FLAGS="-compress-mode=size"
     # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
     export BUILD_BUNDLE_PTXAS=1
+=======
+if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -32,6 +37,7 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
+<<<<<<< HEAD
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
     echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
@@ -50,4 +56,16 @@ else
     fi
 
     python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+=======
+pip install auditwheel==6.2.0
+if [ "$DESIRED_CUDA" = "cpu" ]; then
+    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
+    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+else
+    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
+    export USE_SYSTEM_NCCL=1
+    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
```
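The first two hunks conflict over which CUDA versions get an explicit `TORCH_CUDA_ARCH_LIST`: HEAD matches `12.6` (its further branches are elided by the hunk), while the incoming side matches `12.9`. A rough Python sketch of that substring-driven selection, using only the versions and lists visible in this diff (the dict layout is an illustrative assumption, not the script's code):

```python
# Sketch of the TORCH_CUDA_ARCH_LIST selection in aarch64_ci_build.sh.
# Only the 12.6 (HEAD) and 12.9 (incoming) entries appear in this diff;
# representing the chain of if-blocks as a dict is an assumption.
import os

ARCH_LISTS = {
    "12.6": "8.0;9.0",            # HEAD side
    "12.9": "8.0;9.0;10.0;12.0",  # incoming side
}


def torch_cuda_arch_list(gpu_arch_version: str) -> str | None:
    """Mimic the shell's substring tests: [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]."""
    for version, arch_list in ARCH_LISTS.items():
        if version in gpu_arch_version:
            return arch_list
    return None


arch = torch_cuda_arch_list(os.getenv("GPU_ARCH_VERSION", ""))
if arch is not None:
    os.environ["TORCH_CUDA_ARCH_LIST"] = arch
```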

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 150 additions & 0 deletions
```diff
@@ -13,6 +13,52 @@ def list_dir(path: str) -> list[str]:
     return check_output(["ls", "-1", path]).decode().split("\n")
 
 
+<<<<<<< HEAD
+=======
+def build_ArmComputeLibrary() -> None:
+    """
+    Using ArmComputeLibrary for aarch64 PyTorch
+    """
+    print("Building Arm Compute Library")
+    acl_build_flags = [
+        "debug=0",
+        "neon=1",
+        "opencl=0",
+        "os=linux",
+        "openmp=1",
+        "cppthreads=0",
+        "arch=armv8a",
+        "multi_isa=1",
+        "fixed_format_kernels=1",
+        "build=native",
+    ]
+    acl_install_dir = "/acl"
+    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
+    if os.path.isdir(acl_install_dir):
+        shutil.rmtree(acl_install_dir)
+    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
+        check_call(
+            [
+                "git",
+                "clone",
+                "https://github.com/ARM-software/ComputeLibrary.git",
+                "-b",
+                "v25.02",
+                "--depth",
+                "1",
+                "--shallow-submodules",
+            ]
+        )
+
+    check_call(
+        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
+        cwd=acl_checkout_dir,
+    )
+    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
+        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
+
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def replace_tag(filename) -> None:
     with open(filename) as f:
         lines = f.readlines()
@@ -26,6 +72,7 @@ def replace_tag(filename) -> None:
         f.writelines(lines)
 
 
+<<<<<<< HEAD
 def patch_library_rpath(
     folder: str,
     lib_name: str,
@@ -88,11 +135,14 @@ def copy_and_patch_library(
         patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
 
 
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 def package_cuda_wheel(wheel_path, desired_cuda) -> None:
     """
     Package the cuda wheel libraries
     """
     folder = os.path.dirname(wheel_path)
+<<<<<<< HEAD
     os.mkdir(f"{folder}/tmp")
     os.system(f"unzip {wheel_path} -d {folder}/tmp")
     # Delete original wheel since it will be repackaged
@@ -206,15 +256,77 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
     # Copy libraries to unzipped_folder/torch/lib
     for lib_path in libs_to_copy:
         copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
+=======
+    wheelname = os.path.basename(wheel_path)
+    os.mkdir(f"{folder}/tmp")
+    os.system(f"unzip {wheel_path} -d {folder}/tmp")
+    libs_to_copy = [
+        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/lib64/libcudnn.so.9",
+        "/usr/local/cuda/lib64/libcublas.so.12",
+        "/usr/local/cuda/lib64/libcublasLt.so.12",
+        "/usr/local/cuda/lib64/libcudart.so.12",
+        "/usr/local/cuda/lib64/libcufft.so.11",
+        "/usr/local/cuda/lib64/libcusparse.so.12",
+        "/usr/local/cuda/lib64/libcusparseLt.so.0",
+        "/usr/local/cuda/lib64/libcusolver.so.11",
+        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnccl.so.2",
+        "/usr/local/cuda/lib64/libnvJitLink.so.12",
+        "/usr/local/cuda/lib64/libnvrtc.so.12",
+        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
+        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
+        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
+        "/usr/local/cuda/lib64/libcudnn_ops.so.9",
+        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
+        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
+        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
+        "/lib64/libgomp.so.1",
+        "/usr/lib64/libgfortran.so.5",
+        "/acl/build/libarm_compute.so",
+        "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
+    ]
+
+    if "129" in desired_cuda:
+        libs_to_copy += [
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
+            "/usr/local/cuda/lib64/libcufile.so.0",
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
+        ]
+
+    # Copy libraries to unzipped_folder/a/lib
+    for lib_path in libs_to_copy:
+        lib_name = os.path.basename(lib_path)
+        shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")
+        os.system(
+            f"cd {folder}/tmp/torch/lib/; "
+            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
+        )
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
     # Make sure the wheel is tagged with manylinux_2_28
     for f in os.scandir(f"{folder}/tmp/"):
         if f.is_dir() and f.name.endswith(".dist-info"):
             replace_tag(f"{f.path}/WHEEL")
             break
 
+<<<<<<< HEAD
     os.system(f"wheel pack {folder}/tmp/ -d {folder}")
     os.system(f"rm -rf {folder}/tmp/")
+=======
+    os.mkdir(f"{folder}/cuda_wheel")
+    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
+    shutil.move(
+        f"{folder}/cuda_wheel/{wheelname}",
+        f"{folder}/{wheelname}",
+        copy_function=shutil.copy2,
+    )
+    os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/")
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 
 def complete_wheel(folder: str) -> str:
@@ -237,7 +349,18 @@ def complete_wheel(folder: str) -> str:
             f"/{folder}/dist/{repaired_wheel_name}",
         )
     else:
+<<<<<<< HEAD
         repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
+=======
+        repaired_wheel_name = wheel_name.replace(
+            "linux_aarch64", "manylinux_2_28_aarch64"
+        )
+        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
+        os.rename(
+            f"/{folder}/dist/{wheel_name}",
+            f"/{folder}/dist/{repaired_wheel_name}",
+        )
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
     print(f"Copying {repaired_wheel_name} to artifacts")
     shutil.copy2(
@@ -274,6 +397,7 @@ def parse_arguments():
     ).decode()
 
     print("Building PyTorch wheel")
+<<<<<<< HEAD
     build_vars = ""
     # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
     if enable_cuda:
@@ -288,6 +412,12 @@ def parse_arguments():
         else:
             print("Configuring build for bundled NVIDIA libraries")
             # Keep existing static linking approach - already configured above
+=======
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
+    if enable_cuda:
+        build_vars = "MAX_JOBS=5 " + build_vars
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
     desired_cuda = os.getenv("DESIRED_CUDA")
@@ -313,6 +443,7 @@ def parse_arguments():
     build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
 
     if enable_mkldnn:
+<<<<<<< HEAD
         print("build pytorch with mkldnn+acl backend")
         build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
         build_vars += "ACL_ROOT_DIR=/acl "
@@ -324,6 +455,25 @@ def parse_arguments():
         print("build pytorch without mkldnn backend")
 
     os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
+=======
+        build_ArmComputeLibrary()
+        print("build pytorch with mkldnn+acl backend")
+        build_vars += (
+            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+            "ACL_ROOT_DIR=/acl "
+            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
+            "ACL_INCLUDE_DIR=/acl/build "
+            "ACL_LIBRARY=/acl/build "
+        )
+        if enable_cuda:
+            build_vars += "BLAS=NVPL "
+        else:
+            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
+    else:
+        print("build pytorch without mkldnn backend")
+
+    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     if enable_cuda:
         print("Updating Cuda Dependency")
         filename = os.listdir("/pytorch/dist/")
```
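On the incoming side, `package_cuda_wheel` copies each dependency into `torch/lib` and forces its RPATH to `$ORIGIN`, so the bundled libraries resolve one another inside the unpacked wheel rather than against system paths. A minimal sketch of that copy-and-patch step, routed through `subprocess` instead of the diff's `os.system` string (an assumption for illustration):

```python
# Sketch of the incoming branch's copy-and-patch step; paths mirror the
# diff, but using subprocess for patchelf is an assumption made here.
import os
import shutil
import subprocess


def copy_and_set_origin_rpath(lib_path: str, dest_dir: str) -> None:
    """Copy a shared library into dest_dir and pin its RPATH to $ORIGIN
    so it resolves sibling libraries bundled in the same directory."""
    lib_name = os.path.basename(lib_path)
    dest = os.path.join(dest_dir, lib_name)
    shutil.copy2(lib_path, dest)
    subprocess.check_call(
        ["patchelf", "--set-rpath", "$ORIGIN", "--force-rpath", dest]
    )


# Example usage against the layout created by the unzip step:
# copy_and_set_origin_rpath("/lib64/libgomp.so.1", "dist/tmp/torch/lib")
```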
