46 changes: 46 additions & 0 deletions .automation_scripts/run_pytorch_unit_tests.py
@@ -338,7 +338,11 @@ def run_test_and_summarize_results(

# copy current environment variables
_environ = dict(os.environ)
<<<<<<< HEAD

=======

>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# modify path
test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
test_run_test_path = pytorch_root_dir + "/test/run_test.py"
@@ -385,6 +389,13 @@ def run_test_and_summarize_results(
global CONSOLIDATED_LOG_FILE_PATH
CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME

<<<<<<< HEAD
=======
# Check multi gpu availability if distributed tests are enabled
if ("distributed" in test_config) or len(distributed_list) != 0:
check_num_gpus_for_distributed()

>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# Install test requirements
command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
run_command_and_capture_output(command)
@@ -393,15 +404,23 @@ def run_test_and_summarize_results(
if not priority_tests and not default_list and not distributed_list and not inductor_list:
# run entire tests for default, distributed and inductor workflows → use test.sh
if not test_config:
<<<<<<< HEAD
=======
check_num_gpus_for_distributed()
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# default test process
res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_all
# distributed test process
<<<<<<< HEAD
res_distributed_all = {}
if is_multi_gpus_available_for_distributed():
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_all
# inductor test process
res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
@@ -414,27 +433,39 @@ def run_test_and_summarize_results(
res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_all
if "distributed" in workflow_list:
<<<<<<< HEAD
res_distributed_all = {}
if is_multi_gpus_available_for_distributed():
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_all
if "inductor" in workflow_list:
res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["inductor"] = res_inductor_all
# Run priority test for each workflow
elif priority_tests and not default_list and not distributed_list and not inductor_list:
if not test_config:
<<<<<<< HEAD
=======
check_num_gpus_for_distributed()
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# default test process
res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_priority
# distributed test process
<<<<<<< HEAD
res_distributed_priority = {}
if is_multi_gpus_available_for_distributed():
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_priority
# will not run inductor priority tests
print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
@@ -446,11 +477,15 @@ def run_test_and_summarize_results(
res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_priority
if "distributed" in workflow_list:
<<<<<<< HEAD
res_distributed_priority = {}
if is_multi_gpus_available_for_distributed():
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_priority
if "inductor" in workflow_list:
print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
@@ -466,11 +501,15 @@ def run_test_and_summarize_results(
distributed_workflow_list = []
for item in distributed_list:
distributed_workflow_list.append(item)
<<<<<<< HEAD
res_distributed_selected = {}
if is_multi_gpus_available_for_distributed():
res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_selected
if inductor_list:
inductor_workflow_list = []
@@ -518,10 +557,17 @@ def parse_args():
"RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
return parser.parse_args()

<<<<<<< HEAD
def is_multi_gpus_available_for_distributed():
p = subprocess.run("rocminfo | grep -cE 'Name:\\s+gfx'", shell=True, capture_output=True, text=True)
num_gpus_visible = int(p.stdout)
return num_gpus_visible > 1
=======
def check_num_gpus_for_distributed():
p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
num_gpus_visible = int(p.stdout)
assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

def main():
args = parse_args()
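Note (illustrative aside, not part of this PR): both sides of the conflicts above gate distributed tests on the number of visible GPUs but differ in failure mode — the HEAD side warns and skips distributed runs via is_multi_gpus_available_for_distributed(), while the incoming side hard-fails via an assert in check_num_gpus_for_distributed(). The shared detection step could be factored roughly as in the sketch below; count_visible_rocm_gpus is a hypothetical name, and the sketch assumes rocminfo prints one "Name: gfx..." line per GPU agent.

import subprocess

def count_visible_rocm_gpus() -> int:
    # Hypothetical sketch, not from the PR: count GPU agents reported by
    # rocminfo. Each agent contributes one "Name: gfx..." line, and
    # `grep -c` prints 0 when nothing matches.
    p = subprocess.run(
        "rocminfo | grep -cE 'Name:\\s+gfx'",
        shell=True, capture_output=True, text=True,
    )
    return int(p.stdout.strip() or 0)

With such a helper, the HEAD behavior amounts to checking count_visible_rocm_gpus() > 1 and printing a warning, while the incoming behavior asserts on the same condition.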
9 changes: 9 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,10 +3,15 @@ set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

<<<<<<< HEAD
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
=======
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -27,6 +32,10 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
<<<<<<< HEAD
=======
export USE_SYSTEM_NCCL=1
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
42 changes: 42 additions & 0 deletions .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -31,6 +31,7 @@ def build_ArmComputeLibrary() -> None:
"build=native",
]
acl_install_dir = "/acl"
<<<<<<< HEAD
acl_checkout_dir = "ComputeLibrary"
os.makedirs(acl_install_dir)
check_call(
@@ -52,6 +53,30 @@ def build_ArmComputeLibrary() -> None:
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src"]:
=======
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)

check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


@@ -87,7 +112,11 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
<<<<<<< HEAD
"/usr/local/cuda/lib64/libnvToolsExt.so.1",
=======
"/usr/local/cuda/lib64/libnccl.so.2",
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
@@ -107,9 +136,15 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/lib/libnvpl_blas_core.so.0",
]

<<<<<<< HEAD
if "128" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
=======
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
@@ -203,8 +238,15 @@ def parse_arguments():
).decode()

print("Building PyTorch wheel")
<<<<<<< HEAD
build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
os.system("cd /pytorch; python setup.py clean")
=======
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars = "MAX_JOBS=5 " + build_vars
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
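Aside (illustrative, not part of this PR): the incoming side of build_ArmComputeLibrary() above makes the ACL checkout reusable — it honors ACL_SOURCE_DIR, removes a stale /acl install tree, and clones ComputeLibrary v25.02 only when the checkout directory is missing or empty. A minimal sketch of that reuse pattern follows; prepare_acl_checkout is a hypothetical helper name, and the clone target is made explicit here, which the PR's code leaves implicit.

import os
import shutil
from subprocess import check_call

def prepare_acl_checkout(install_dir: str = "/acl",
                         default_checkout: str = "ComputeLibrary") -> str:
    # Hypothetical sketch, not from the PR: reuse an existing source tree when
    # ACL_SOURCE_DIR points at one; otherwise make a fresh shallow clone of the
    # pinned release tag.
    checkout_dir = os.getenv("ACL_SOURCE_DIR", default_checkout)
    if os.path.isdir(install_dir):
        shutil.rmtree(install_dir)  # drop stale install artifacts
    if not os.path.isdir(checkout_dir) or not os.listdir(checkout_dir):
        check_call([
            "git", "clone", "-b", "v25.02", "--depth", "1",
            "--shallow-submodules",
            "https://github.com/ARM-software/ComputeLibrary.git",
            checkout_dir,
        ])
    return checkout_dir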
13 changes: 13 additions & 0 deletions .ci/aarch64_linux/build_aarch64_wheel.py
@@ -19,13 +19,19 @@

# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
<<<<<<< HEAD
"ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}

<<<<<<< HEAD
ubuntu18_04_ami = os_amis["ubuntu18_04"]
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
ubuntu20_04_ami = os_amis["ubuntu20_04"]


@@ -659,6 +665,7 @@ def configure_system(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
<<<<<<< HEAD
# Install and switch to gcc-8 on Ubuntu-18.04
if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
@@ -671,6 +678,8 @@ def configure_system(
host.run_cmd(
"sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
)
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
@@ -1026,7 +1035,11 @@ def parse_arguments():
install_condaforge_python(host, args.python_version)
sys.exit(0)

<<<<<<< HEAD
python_version = args.python_version if args.python_version is not None else "3.8"
=======
python_version = args.python_version if args.python_version is not None else "3.9"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
3 changes: 3 additions & 0 deletions .ci/caffe2/README.md
@@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
built on Jenkins and are used in triggered builds already have this
environment variable set in their manifest. Also see
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
<<<<<<< HEAD

Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
3 changes: 3 additions & 0 deletions .ci/caffe2/test.sh
@@ -13,10 +13,13 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
echo 'Skipping tests'
exit 0
fi
<<<<<<< HEAD
if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
# temporary to locate some kernel issues on the CI nodes
export HSAKMT_DEBUG_LEVEL=4
fi
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# These additional packages are needed for circleci ROCm builds.
if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
# Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
4 changes: 4 additions & 0 deletions .ci/docker/README.md
@@ -34,5 +34,9 @@ See `build.sh` for valid build environments (it's the giant switch).
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest

# Set flags (see build.sh) and build image
<<<<<<< HEAD
sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
=======
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
```