Commit ee04104

jerrymannil authored and AMD AMD committed
Cherry-picked commit with merge conflict
1 parent a7b6c00 commit ee04104

Showing 6,086 changed files with 372,693 additions and 846 deletions.

.automation_scripts/run_pytorch_unit_tests.py

Lines changed: 46 additions & 0 deletions

@@ -338,7 +338,11 @@ def run_test_and_summarize_results(
 
     # copy current environment variables
     _environ = dict(os.environ)
+<<<<<<< HEAD
 
+=======
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # modify path
     test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
     test_run_test_path = pytorch_root_dir + "/test/run_test.py"

@@ -385,6 +389,13 @@ def run_test_and_summarize_results(
     global CONSOLIDATED_LOG_FILE_PATH
     CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME
 
+<<<<<<< HEAD
+=======
+    # Check multi gpu availability if distributed tests are enabled
+    if ("distributed" in test_config) or len(distributed_list) != 0:
+        check_num_gpus_for_distributed()
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # Install test requirements
     command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
     run_command_and_capture_output(command)

@@ -393,15 +404,23 @@ def run_test_and_summarize_results(
     if not priority_tests and not default_list and not distributed_list and not inductor_list:
         # run entire tests for default, distributed and inductor workflows → use test.sh
         if not test_config:
+<<<<<<< HEAD
+=======
+            check_num_gpus_for_distributed()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             # default test process
             res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
             res_all_tests_dict["default"] = res_default_all
             # distributed test process
+<<<<<<< HEAD
             res_distributed_all = {}
             if is_multi_gpus_available_for_distributed():
                 res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
             else:
                 print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+            res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             res_all_tests_dict["distributed"] = res_distributed_all
             # inductor test process
             res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)

@@ -414,27 +433,39 @@ def run_test_and_summarize_results(
             res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
             res_all_tests_dict["default"] = res_default_all
             if "distributed" in workflow_list:
+<<<<<<< HEAD
                 res_distributed_all = {}
                 if is_multi_gpus_available_for_distributed():
                     res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
                 else:
                     print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
                 res_all_tests_dict["distributed"] = res_distributed_all
             if "inductor" in workflow_list:
                 res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
                 res_all_tests_dict["inductor"] = res_inductor_all
     # Run priority test for each workflow
     elif priority_tests and not default_list and not distributed_list and not inductor_list:
         if not test_config:
+<<<<<<< HEAD
+=======
+            check_num_gpus_for_distributed()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             # default test process
             res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
             res_all_tests_dict["default"] = res_default_priority
             # distributed test process
+<<<<<<< HEAD
             res_distributed_priority = {}
             if is_multi_gpus_available_for_distributed():
                 res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
             else:
                 print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+            res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             res_all_tests_dict["distributed"] = res_distributed_priority
             # will not run inductor priority tests
             print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")

@@ -446,11 +477,15 @@ def run_test_and_summarize_results(
             res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
             res_all_tests_dict["default"] = res_default_priority
             if "distributed" in workflow_list:
+<<<<<<< HEAD
                 res_distributed_priority = {}
                 if is_multi_gpus_available_for_distributed():
                     res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
                 else:
                     print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+                res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
                 res_all_tests_dict["distributed"] = res_distributed_priority
             if "inductor" in workflow_list:
                 print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")

@@ -466,11 +501,15 @@ def run_test_and_summarize_results(
             distributed_workflow_list = []
             for item in distributed_list:
                 distributed_workflow_list.append(item)
+<<<<<<< HEAD
             res_distributed_selected = {}
             if is_multi_gpus_available_for_distributed():
                 res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
             else:
                 print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+            res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             res_all_tests_dict["distributed"] = res_distributed_selected
         if inductor_list:
             inductor_workflow_list = []

@@ -518,10 +557,17 @@ def parse_args():
            "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
     return parser.parse_args()
 
+<<<<<<< HEAD
 def is_multi_gpus_available_for_distributed():
     p = subprocess.run("rocminfo | grep -cE 'Name:\\s+gfx'", shell=True, capture_output=True, text=True)
     num_gpus_visible = int(p.stdout)
     return num_gpus_visible > 1
+=======
+def check_num_gpus_for_distributed():
+    p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
+    num_gpus_visible = int(p.stdout)
+    assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 def main():
     args = parse_args()
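The conflicts in this file all reduce to one question: what should the script do when fewer than two GPUs are visible? The HEAD side probes with is_multi_gpus_available_for_distributed() and skips distributed tests with a warning, while the incoming side calls check_num_gpus_for_distributed() and aborts via assert. A minimal sketch of the two behaviors, assuming a ROCm host where the rocminfo pipeline from the diff is available (helper names here are illustrative, not from the script):

import subprocess

def visible_gpu_count() -> int:
    # Count the 'Name: gfx...' agents rocminfo reports (pipeline taken from the diff).
    p = subprocess.run(r"rocminfo | grep -cE 'Name:\s+gfx'",
                       shell=True, capture_output=True, text=True)
    return int(p.stdout)

def run_distributed_head():
    # HEAD side: warn and skip when multi-GPU is unavailable.
    if visible_gpu_count() > 1:
        print("running distributed tests")
    else:
        print("Warning: Cannot run distributed unit tests. "
              "Number of visible GPUs should be >1 to run distributed unit tests.")

def run_distributed_incoming():
    # Incoming side: fail fast before any tests run.
    assert visible_gpu_count() > 1, \
        "Number of visible GPUs should be >1 to run distributed unit tests"
    print("running distributed tests")

Resolving the conflict is therefore a policy choice: warn-and-skip lets a single-GPU runner still finish the default and inductor workflows, while the assert surfaces a misconfigured multi-GPU runner immediately.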

.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 9 additions & 0 deletions

@@ -3,10 +3,15 @@ set -eux -o pipefail
 
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 
+<<<<<<< HEAD
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
     export TORCH_CUDA_ARCH_LIST="9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
     export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
+=======
+if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 fi
 
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

@@ -27,6 +32,10 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
+<<<<<<< HEAD
+=======
+    export USE_SYSTEM_NCCL=1
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
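The first conflict in this script is a disagreement over which CUDA versions map to which TORCH_CUDA_ARCH_LIST values. Restated as data for side-by-side comparison (the arch-list strings are copied verbatim from the two sides; the dict form is only an illustration, not code from the script):

# HEAD side: per-version arch lists for CUDA 12.6 and 12.8.
head_arch_lists = {
    "12.6": "9.0",
    "12.8": "9.0;10.0;12.0",
}

# Incoming side: a single entry for CUDA 12.9, which also adds sm_80.
incoming_arch_lists = {
    "12.9": "8.0;9.0;10.0;12.0",
}

print(head_arch_lists["12.8"], "vs", incoming_arch_lists["12.9"])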

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 42 additions & 0 deletions

@@ -31,6 +31,7 @@ def build_ArmComputeLibrary() -> None:
         "build=native",
     ]
     acl_install_dir = "/acl"
+<<<<<<< HEAD
     acl_checkout_dir = "ComputeLibrary"
     os.makedirs(acl_install_dir)
     check_call(

@@ -52,6 +53,30 @@ def build_ArmComputeLibrary() -> None:
         cwd=acl_checkout_dir,
     )
     for d in ["arm_compute", "include", "utils", "support", "src"]:
+=======
+    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
+    if os.path.isdir(acl_install_dir):
+        shutil.rmtree(acl_install_dir)
+    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
+        check_call(
+            [
+                "git",
+                "clone",
+                "https://github.com/ARM-software/ComputeLibrary.git",
+                "-b",
+                "v25.02",
+                "--depth",
+                "1",
+                "--shallow-submodules",
+            ]
+        )
+
+    check_call(
+        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
+        cwd=acl_checkout_dir,
+    )
+    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
 
 

@@ -87,7 +112,11 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
         "/usr/local/cuda/lib64/libcusparseLt.so.0",
         "/usr/local/cuda/lib64/libcusolver.so.11",
         "/usr/local/cuda/lib64/libcurand.so.10",
+<<<<<<< HEAD
         "/usr/local/cuda/lib64/libnvToolsExt.so.1",
+=======
+        "/usr/local/cuda/lib64/libnccl.so.2",
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         "/usr/local/cuda/lib64/libnvJitLink.so.12",
         "/usr/local/cuda/lib64/libnvrtc.so.12",
         "/usr/local/cuda/lib64/libcudnn_adv.so.9",

@@ -107,9 +136,15 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
         "/usr/local/lib/libnvpl_blas_core.so.0",
     ]
 
+<<<<<<< HEAD
     if "128" in desired_cuda:
         libs_to_copy += [
             "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
+=======
+    if "129" in desired_cuda:
+        libs_to_copy += [
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
             "/usr/local/cuda/lib64/libcufile.so.0",
             "/usr/local/cuda/lib64/libcufile_rdma.so.1",
         ]

@@ -203,8 +238,15 @@ def parse_arguments():
     ).decode()
 
     print("Building PyTorch wheel")
+<<<<<<< HEAD
     build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
     os.system("cd /pytorch; python setup.py clean")
+=======
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
+    if enable_cuda:
+        build_vars = "MAX_JOBS=5 " + build_vars
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
     desired_cuda = os.getenv("DESIRED_CUDA")
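The largest hunk here is the incoming rewrite of build_ArmComputeLibrary(), which makes the ACL build restartable: it reuses an existing checkout pointed to by ACL_SOURCE_DIR, clears a stale /acl install directory, and shallow-clones ComputeLibrary v25.02 only when no usable checkout exists. The core pattern, distilled into a self-contained sketch (paths, the env var, and the v25.02 tag are taken from the diff):

import os
import shutil
from subprocess import check_call

acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")

# Remove any stale install tree so a later copy starts clean.
if os.path.isdir(acl_install_dir):
    shutil.rmtree(acl_install_dir)

# Clone only when the checkout is absent or empty; --depth 1 and
# --shallow-submodules keep the fetch small for CI.
if not os.path.isdir(acl_checkout_dir) or not os.listdir(acl_checkout_dir):
    check_call([
        "git", "clone",
        "https://github.com/ARM-software/ComputeLibrary.git",
        "-b", "v25.02", "--depth", "1", "--shallow-submodules",
    ])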

.ci/aarch64_linux/build_aarch64_wheel.py

Lines changed: 13 additions & 0 deletions

@@ -19,13 +19,19 @@
 
 # AMI images for us-east-1, change the following based on your ~/.aws/config
 os_amis = {
+<<<<<<< HEAD
     "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
     "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
     "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
 }
 
+<<<<<<< HEAD
 ubuntu18_04_ami = os_amis["ubuntu18_04"]
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 ubuntu20_04_ami = os_amis["ubuntu20_04"]
 
 

@@ -659,6 +665,7 @@ def configure_system(
         "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
     )
     host.run_cmd("pip3 install dataclasses typing-extensions")
+<<<<<<< HEAD
     # Install and switch to gcc-8 on Ubuntu-18.04
     if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
         host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")

@@ -671,6 +678,8 @@ def configure_system(
         host.run_cmd(
             "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
         )
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     if not use_conda:
         print("Installing Cython + numpy from PyPy")
         host.run_cmd("sudo pip3 install Cython")

@@ -1026,7 +1035,11 @@ def parse_arguments():
         install_condaforge_python(host, args.python_version)
         sys.exit(0)
 
+<<<<<<< HEAD
     python_version = args.python_version if args.python_version is not None else "3.8"
+=======
+    python_version = args.python_version if args.python_version is not None else "3.9"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
     if args.use_torch_from_pypi:
         configure_system(host, compiler=args.compiler, python_version=python_version)

.ci/caffe2/README.md

Lines changed: 3 additions & 0 deletions

@@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
+<<<<<<< HEAD
 
 Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

.ci/caffe2/test.sh

Lines changed: 3 additions & 0 deletions

@@ -13,10 +13,13 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   echo 'Skipping tests'
   exit 0
 fi
+<<<<<<< HEAD
 if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
   # temporary to locate some kernel issues on the CI nodes
   export HSAKMT_DEBUG_LEVEL=4
 fi
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 # These additional packages are needed for circleci ROCm builds.
 if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
   # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by

.ci/docker/README.md

Lines changed: 4 additions & 0 deletions

@@ -34,5 +34,9 @@ See `build.sh` for valid build environments (it's the giant switch).
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 
 # Set flags (see build.sh) and build image
+<<<<<<< HEAD
 sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+=======
+sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 ```
