46 changes: 46 additions & 0 deletions .automation_scripts/run_pytorch_unit_tests.py
@@ -338,7 +338,11 @@ def run_test_and_summarize_results(

# copy current environment variables
_environ = dict(os.environ)
<<<<<<< HEAD

=======

>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# modify path
test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
test_run_test_path = pytorch_root_dir + "/test/run_test.py"
@@ -385,6 +389,13 @@ def run_test_and_summarize_results(
global CONSOLIDATED_LOG_FILE_PATH
CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME

<<<<<<< HEAD
=======
# Check multi gpu availability if distributed tests are enabled
if ("distributed" in test_config) or len(distributed_list) != 0:
check_num_gpus_for_distributed()

>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# Install test requirements
command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
run_command_and_capture_output(command)
@@ -393,15 +404,23 @@ def run_test_and_summarize_results(
if not priority_tests and not default_list and not distributed_list and not inductor_list:
# run entire tests for default, distributed and inductor workflows → use test.sh
if not test_config:
<<<<<<< HEAD
=======
check_num_gpus_for_distributed()
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# default test process
res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_all
# distributed test process
<<<<<<< HEAD
res_distributed_all = {}
if is_multi_gpus_available_for_distributed():
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_all
# inductor test process
res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
@@ -414,27 +433,39 @@ def run_test_and_summarize_results(
res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_all
if "distributed" in workflow_list:
<<<<<<< HEAD
res_distributed_all = {}
if is_multi_gpus_available_for_distributed():
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_all
if "inductor" in workflow_list:
res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["inductor"] = res_inductor_all
# Run priority test for each workflow
elif priority_tests and not default_list and not distributed_list and not inductor_list:
if not test_config:
<<<<<<< HEAD
=======
check_num_gpus_for_distributed()
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# default test process
res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_priority
# distributed test process
<<<<<<< HEAD
res_distributed_priority = {}
if is_multi_gpus_available_for_distributed():
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_priority
# will not run inductor priority tests
print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
@@ -446,11 +477,15 @@ def run_test_and_summarize_results(
res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
res_all_tests_dict["default"] = res_default_priority
if "distributed" in workflow_list:
<<<<<<< HEAD
res_distributed_priority = {}
if is_multi_gpus_available_for_distributed():
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_priority
if "inductor" in workflow_list:
print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
@@ -466,11 +501,15 @@ def run_test_and_summarize_results(
distributed_workflow_list = []
for item in distributed_list:
distributed_workflow_list.append(item)
<<<<<<< HEAD
res_distributed_selected = {}
if is_multi_gpus_available_for_distributed():
res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
else:
print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
=======
res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
res_all_tests_dict["distributed"] = res_distributed_selected
if inductor_list:
inductor_workflow_list = []
@@ -518,10 +557,17 @@ def parse_args():
"RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
return parser.parse_args()

<<<<<<< HEAD
def is_multi_gpus_available_for_distributed():
p = subprocess.run("rocminfo | grep -cE 'Name:\\s+gfx'", shell=True, capture_output=True, text=True)
num_gpus_visible = int(p.stdout)
return num_gpus_visible > 1
=======
def check_num_gpus_for_distributed():
p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
num_gpus_visible = int(p.stdout)
assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

def main():
args = parse_args()
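Note (illustrative aside, not part of this PR): both sides of the conflicts above gate distributed tests on the number of visible GPUs but differ in failure mode — the HEAD side warns and skips distributed runs via is_multi_gpus_available_for_distributed(), while the incoming side hard-fails via an assert in check_num_gpus_for_distributed(). The shared detection step could be factored roughly as in the sketch below; count_visible_rocm_gpus is a hypothetical name, and the sketch assumes rocminfo prints one "Name: gfx..." line per GPU agent.

import subprocess

def count_visible_rocm_gpus() -> int:
    # Hypothetical sketch, not from the PR: count GPU agents reported by
    # rocminfo. Each agent contributes one "Name: gfx..." line, and
    # `grep -c` prints 0 when nothing matches.
    p = subprocess.run(
        "rocminfo | grep -cE 'Name:\\s+gfx'",
        shell=True, capture_output=True, text=True,
    )
    return int(p.stdout.strip() or 0)

With such a helper, the HEAD behavior amounts to checking count_visible_rocm_gpus() > 1 and printing a warning, while the incoming behavior asserts on the same condition.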
9 changes: 9 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,10 +3,15 @@ set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

<<<<<<< HEAD
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
=======
if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -27,6 +32,10 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
<<<<<<< HEAD
=======
export USE_SYSTEM_NCCL=1
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
42 changes: 42 additions & 0 deletions .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -31,6 +31,7 @@ def build_ArmComputeLibrary() -> None:
"build=native",
]
acl_install_dir = "/acl"
<<<<<<< HEAD
acl_checkout_dir = "ComputeLibrary"
os.makedirs(acl_install_dir)
check_call(
@@ -52,6 +53,30 @@ def build_ArmComputeLibrary() -> None:
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src"]:
=======
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)

check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


@@ -87,7 +112,11 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libcurand.so.10",
<<<<<<< HEAD
"/usr/local/cuda/lib64/libnvToolsExt.so.1",
=======
"/usr/local/cuda/lib64/libnccl.so.2",
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
@@ -107,9 +136,15 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/lib/libnvpl_blas_core.so.0",
]

<<<<<<< HEAD
if "128" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
=======
if "129" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
@@ -203,8 +238,15 @@ def parse_arguments():
).decode()

print("Building PyTorch wheel")
<<<<<<< HEAD
build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
os.system("cd /pytorch; python setup.py clean")
=======
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars = "MAX_JOBS=5 " + build_vars
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
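Aside (illustrative, not part of this PR): the incoming side of build_ArmComputeLibrary() above makes the ACL checkout reusable — it honors ACL_SOURCE_DIR, removes a stale /acl install tree, and clones ComputeLibrary v25.02 only when the checkout directory is missing or empty. A minimal sketch of that reuse pattern follows; prepare_acl_checkout is a hypothetical helper name, and the clone target is made explicit here, which the PR's code leaves implicit.

import os
import shutil
from subprocess import check_call

def prepare_acl_checkout(install_dir: str = "/acl",
                         default_checkout: str = "ComputeLibrary") -> str:
    # Hypothetical sketch, not from the PR: reuse an existing source tree when
    # ACL_SOURCE_DIR points at one; otherwise make a fresh shallow clone of the
    # pinned release tag.
    checkout_dir = os.getenv("ACL_SOURCE_DIR", default_checkout)
    if os.path.isdir(install_dir):
        shutil.rmtree(install_dir)  # drop stale install artifacts
    if not os.path.isdir(checkout_dir) or not os.listdir(checkout_dir):
        check_call([
            "git", "clone", "-b", "v25.02", "--depth", "1",
            "--shallow-submodules",
            "https://github.com/ARM-software/ComputeLibrary.git",
            checkout_dir,
        ])
    return checkout_dir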
13 changes: 13 additions & 0 deletions .ci/aarch64_linux/build_aarch64_wheel.py
@@ -19,13 +19,19 @@

# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
<<<<<<< HEAD
"ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}

<<<<<<< HEAD
ubuntu18_04_ami = os_amis["ubuntu18_04"]
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
ubuntu20_04_ami = os_amis["ubuntu20_04"]


@@ -659,6 +665,7 @@ def configure_system(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
<<<<<<< HEAD
# Install and switch to gcc-8 on Ubuntu-18.04
if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
@@ -671,6 +678,8 @@ def configure_system(
host.run_cmd(
"sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
)
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
@@ -1026,7 +1035,11 @@ def parse_arguments():
install_condaforge_python(host, args.python_version)
sys.exit(0)

<<<<<<< HEAD
python_version = args.python_version if args.python_version is not None else "3.8"
=======
python_version = args.python_version if args.python_version is not None else "3.9"
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
3 changes: 3 additions & 0 deletions .ci/caffe2/README.md
@@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
built on Jenkins and are used in triggered builds already have this
environment variable set in their manifest. Also see
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
<<<<<<< HEAD

Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
3 changes: 3 additions & 0 deletions .ci/caffe2/test.sh
@@ -13,10 +13,13 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
echo 'Skipping tests'
exit 0
fi
<<<<<<< HEAD
if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
# temporary to locate some kernel issues on the CI nodes
export HSAKMT_DEBUG_LEVEL=4
fi
=======
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# These additional packages are needed for circleci ROCm builds.
if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
# Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
4 changes: 4 additions & 0 deletions .ci/docker/README.md
@@ -34,5 +34,9 @@ See `build.sh` for valid build environments (it's the giant switch).
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest

# Set flags (see build.sh) and build image
<<<<<<< HEAD
sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
=======
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
```